import numpy as np
import pandas as pd

population = pd.read_csv('Population.csv')
mortality = pd.read_csv('MortalityRateAttributedToWater.csv')
political_stability = pd.read_csv('PoliticalStability.csv')
water_service = pd.read_csv('BasicAndSafelyManagedDrinkingWaterServices.csv')
region = pd.read_csv('RegionCountry.csv')

population.head(5)

population.dtypes

Country         object
Granularity     object
Year             int64
Population     float64
dtype: object

print('Granularity : ', list(population['Granularity'].unique()))

Granularity :  ['Total', 'Male', 'Female', 'Rural', 'Urban']

print('Year : ', list(population['Year'].unique()))

Year :  [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

#Affichage nombre de doublons 

def doublons(df): 
    print(len(df)-len(df.drop_duplicates()), 'doublons')

doublons(population)

0 doublons

population.isna().sum()

Country        0
Granularity    0
Year           0
Population     0
dtype: int64

population = population[~population['Granularity'].isin(['Male','Female'])]
population = population.reset_index(drop=True)
population

print('Nombre de pays :', population['Country'].nunique())

Nombre de pays : 239

population['Population']=population['Population']*1000
population

population_mondiale = population[(population['Granularity']=='Total')&(population['Year']==2018)]
population_mondiale

print('Nombre de population en 2018 : ', population_mondiale['Population'].sum())

Nombre de population en 2018 :  9090746135.0

population_mondiale.sort_values(by='Population', ascending=False).head(10)

population = population.drop(population[population['Country']=='China, mainland'].index).reset_index(drop=True)
population.sort_values(by='Population', ascending=False).head(10)

population

population_mondiale2 = population[(population['Granularity']=='Total')&(population['Year']==2018)]
population_mondiale2

print('Nombre de population en 2018 : ', population_mondiale2['Population'].sum())

Nombre de population en 2018 :  7663098349.0

population['Population'].max()

1459377612.0

population['Population'].min()

0.0

mortality.head(5)

4824.353/35383032.0 *100000 
#Afghanistan 2016

13.634651207957534

mortality.dtypes

Year                                                               int64
Country                                                           object
Granularity                                                       object
Mortality rate attributed to exposure to unsafe WASH services    float64
WASH deaths                                                      float64
dtype: object

print('Granularity : ', list(mortality['Granularity'].unique()))

Granularity :  ['Female', 'Male', 'Total']

print('Year : ', list(mortality['Year'].unique()))

Year :  [2016]

doublons(mortality)

0 doublons

mortality.isna().sum()

Year                                                               0
Country                                                            0
Granularity                                                        0
Mortality rate attributed to exposure to unsafe WASH services      0
WASH deaths                                                      366
dtype: int64

missing_rows = mortality[mortality.isna().any(axis=1)]
print(missing_rows)

     Year      Country Granularity  \
0    2016  Afghanistan      Female   
1    2016  Afghanistan        Male   
3    2016      Albania      Female   
4    2016      Albania        Male   
6    2016      Algeria      Female   
..    ...          ...         ...   
541  2016        Yemen        Male   
543  2016       Zambia      Female   
544  2016       Zambia        Male   
546  2016     Zimbabwe      Female   
547  2016     Zimbabwe        Male   

     Mortality rate attributed to exposure to unsafe WASH services  \
0                                             15.31193               
1                                             12.61297               
3                                              0.12552               
4                                              0.20650               
6                                              2.19890               
..                                                 ...               
541                                            8.66892               
543                                           33.23213               
544                                           36.61913               
546                                           22.16388               
547                                           27.06688               

     WASH deaths  
0            NaN  
1            NaN  
3            NaN  
4            NaN  
6            NaN  
..           ...  
541          NaN  
543          NaN  
544          NaN  
546          NaN  
547          NaN  

[366 rows x 5 columns]

mortality = mortality[(mortality['Granularity']=='Total')].reset_index(drop=True)
mortality

mortality.isna().sum()

Year                                                             0
Country                                                          0
Granularity                                                      0
Mortality rate attributed to exposure to unsafe WASH services    0
WASH deaths                                                      0
dtype: int64

print('Nombre de pays :', mortality['Country'].nunique())

Nombre de pays : 183

mortality.describe()

political_stability.head(5)

political_stability.dtypes

Country                 object
Year                     int64
Political_Stability    float64
Granularity             object
dtype: object

print('Granularity : ', list(political_stability['Granularity'].unique()))

Granularity :  ['Total']

print('Year : ', list(political_stability['Year'].unique()))

Year :  [2000, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

#filtrer les années 2000 et 2002
political_stability2 = political_stability[(political_stability['Year']==2000)|(political_stability['Year']==2002)]
political_stability2.head(6)

#calculer les moyennes par pays pour les colonnes numériques 
avg_2001 = political_stability2.groupby('Country').mean(numeric_only=True).reset_index()
avg_2001.head(6)

#2001 doit être un int
avg_2001= avg_2001.astype({'Year':int})
avg_2001.dtypes

Country                 object
Year                     int32
Political_Stability    float64
dtype: object

#ajout de 2001 dans la table 
political_stability = pd.concat([political_stability,avg_2001], ignore_index=True)
political_stability

#mettre les nan en total
political_stability['Granularity']= political_stability['Granularity'].replace(np.nan,'Total')
political_stability

#trier par année pour plus de lisibilité 
political_stability= political_stability.sort_values(by=['Country','Year']).reset_index(drop=True)
political_stability

print('Year : ', list(political_stability['Year'].unique()))

Year :  [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

doublons(political_stability)

0 doublons

political_stability.isna().sum()

Country                0
Year                   0
Political_Stability    0
Granularity            0
dtype: int64

print('Nombre de pays :', political_stability['Country'].nunique())

Nombre de pays : 200

political_stability.describe()

water_service.head(5)

water_service.dtypes

Year                                                             int64
Country                                                         object
Granularity                                                     object
Population using at least basic drinking-water services (%)    float64
Population using safely managed drinking-water services (%)    float64
dtype: object

print('Granularity : ', list(water_service['Granularity'].unique()))

Granularity :  ['Rural', 'Total', 'Urban']

print('Year : ', list(water_service['Year'].unique()))

Year :  [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]

doublons(water_service)

0 doublons

water_service.isna().sum()

Year                                                              0
Country                                                           0
Granularity                                                       0
Population using at least basic drinking-water services (%)    1061
Population using safely managed drinking-water services (%)    7190
dtype: int64

missing_rows = water_service[water_service.isna().any(axis=1)]
print(missing_rows)

       Year      Country Granularity  \
0      2000  Afghanistan       Rural   
1      2000  Afghanistan       Total   
2      2000  Afghanistan       Urban   
3      2000      Albania       Rural   
5      2000      Albania       Urban   
...     ...          ...         ...   
10470  2017       Zambia       Rural   
10471  2017       Zambia       Total   
10473  2017     Zimbabwe       Rural   
10474  2017     Zimbabwe       Total   
10475  2017     Zimbabwe       Urban   

       Population using at least basic drinking-water services (%)  \
0                                               21.61913             
1                                               27.77190             
2                                               49.48745             
3                                               81.78472             
5                                               96.35529             
...                                                  ...             
10470                                           41.95206             
10471                                           59.96376             
10473                                           49.80476             
10474                                           64.05123             
10475                                           93.99767             

       Population using safely managed drinking-water services (%)  
0                                                    NaN            
1                                                    NaN            
2                                                    NaN            
3                                                    NaN            
5                                                    NaN            
...                                                  ...            
10470                                                NaN            
10471                                                NaN            
10473                                                NaN            
10474                                                NaN            
10475                                                NaN            

[7190 rows x 5 columns]

pourcentage_nan = water_service.isna().sum()/len(water_service) *100
print(pourcentage_nan)

Year                                                            0.000000
Country                                                         0.000000
Granularity                                                     0.000000
Population using at least basic drinking-water services (%)    10.127911
Population using safely managed drinking-water services (%)    68.633066
dtype: float64

print('Nombre de pays :', water_service['Country'].nunique())

Nombre de pays : 194

water_service.describe()

water_service_pivot = water_service.copy()

water_service_pivot = water_service_pivot.rename( columns = {
    'Population using at least basic drinking-water services (%)' : 'Taux eau potable',
    'Population using safely managed drinking-water services (%)' : 'Taux eau sûre'})

water_service_pivot

water_service_pivot = water_service_pivot.melt(
    id_vars = ['Year', 'Country', 'Granularity'],
    value_vars=['Taux eau potable', 'Taux eau sûre'],
    var_name='Type eau',
    value_name='Taux'
)

water_service_pivot

region.head(5)

region = region.rename(columns={'REGION (DISPLAY)':'Region', 'COUNTRY (DISPLAY)': 'Country'})
region.head(5)

region.dtypes

Region     object
Country    object
dtype: object

doublons(region)

0 doublons

region.isna().sum()

Region     0
Country    0
dtype: int64

print('Nombre de pays :', region['Country'].nunique())

Nombre de pays : 194

print('Region : ', list(region['Region'].unique()))

Region :  ['Europe', 'Western Pacific', 'Eastern Mediterranean', 'South-East Asia', 'Africa', 'Americas']

diff_countries = region['Country'].isin(population['Country']) == False
region[diff_countries]

diff_countries = population_mondiale2['Country'].isin(region['Country']) == False

population_mondiale2[diff_countries]

diff_countries = region['Country'].isin(mortality['Country']) == False
region[diff_countries]

diff_countries = mortality['Country'].isin(region['Country']) == False
mortality[diff_countries]

diff_countries = region['Country'].isin(political_stability['Country']) == False
region[diff_countries]

diff_countries = political_stability['Country'].isin(region['Country']) == False

df = political_stability[(political_stability['Year']==2000)&(diff_countries)]
df

diff_countries = region['Country'].isin(water_service['Country']) == False
region[diff_countries]

diff_countries = water_service['Country'].isin(region['Country']) == False
water_service[diff_countries]

population_mondiale2[population_mondiale2['Country'].str.contains('China',case=False,na=False)]

region[region['Country'].str.contains('macedonia',case=False,na=False)]

political_stability["Country"]= political_stability["Country"].replace({"China, mainland":"China"})

political_stability["Country"]= political_stability["Country"].replace({"North Macedonia":"Republic of North Macedonia"})

population["Country"]= population["Country"].replace({"North Macedonia":"Republic of North Macedonia"})

region.to_csv("Region_clean.csv", index=False)

population.to_csv("Population_clean.csv", index=False)

political_stability.to_csv("Political_stability_clean.csv", index=False)

mortality.to_csv("Mortality_clean.csv", index=False)

water_service.to_csv("Water_service_clean.csv", index=False)

water_service_pivot.to_csv("Water_service_pivot.csv", index=False)

	Year	Mortality rate attributed to exposure to unsafe WASH services	WASH deaths
count	183.0	183.000000	183.000000
mean	2016.0	12.492651	4756.097706
std	0.0	20.819238	21280.125369
min	2016.0	0.006270	0.082290
25%	2016.0	0.199985	11.163275
50%	2016.0	1.332760	130.983400
75%	2016.0	17.819700	1950.433500
max	2016.0	101.043120	246087.900000

	Country	Year	Political_Stability
0	Afghanistan	2001.0	-2.240
1	Albania	2001.0	-0.415
2	Algeria	2001.0	-1.530
3	Andorra	2001.0	1.225
4	Angola	2001.0	-1.810
5	Antigua and Barbuda	2001.0	0.750

	Year	Political_Stability
count	3712.000000	3712.000000
mean	2009.094828	-0.049203
std	5.449472	0.995483
min	2000.000000	-3.310000
25%	2004.000000	-0.710000
50%	2009.000000	0.050000
75%	2014.000000	0.800000
max	2018.000000	1.970000

	Year	Population using at least basic drinking-water services (%)	Population using safely managed drinking-water services (%)
count	10476.000000	9415.000000	3286.000000
mean	2008.500000	83.962120	66.070856
std	5.188375	19.968269	30.383942
min	2000.000000	4.082620	0.000000
25%	2004.000000	75.928395	41.895583
50%	2008.500000	93.115400	73.966655
75%	2013.000000	98.954240	94.776640
max	2017.000000	100.000010	100.000000

	Country	Granularity	Year	Population
225	American Samoa	Total	2018	55465.0
396	Anguilla	Total	2018	14731.0
624	Aruba	Total	2018	105845.0
1308	Bermuda	Total	2018	62756.0
1446	Bonaire, Sint Eustatius and Saba	Total	2018	25711.0
1674	British Virgin Islands	Total	2018	29802.0
2187	Cayman Islands	Total	2018	64174.0
2358	Channel Islands	Total	2018	170499.0
2529	China, Hong Kong SAR	Total	2018	7371730.0
2586	China, Macao SAR	Total	2018	631636.0
2643	China, Taiwan Province of	Total	2018	23726460.0
3123	Curaçao	Total	2018	162752.0
4092	Falkland Islands (Malvinas)	Total	2018	3234.0
4149	Faroe Islands	Total	2018	48497.0
4377	French Guyana	Total	2018	282938.0
4434	French Polynesia	Total	2018	277679.0
4776	Gibraltar	Total	2018	33718.0
4890	Greenland	Total	2018	56564.0
5004	Guadeloupe	Total	2018	399848.0
5061	Guam	Total	2018	165768.0
5403	Holy See	Total	2018	801.0
5916	Isle of Man	Total	2018	84077.0
6885	Liechtenstein	Total	2018	37910.0
7455	Martinique	Total	2018	375673.0
7626	Mayotte	Total	2018	259531.0
7950	Montserrat	Total	2018	4993.0
8406	Netherlands Antilles (former)	Total	2018	277483.0
8463	New Caledonia	Total	2018	279993.0
8805	North Macedonia	Total	2018	2082957.0
8862	Northern Mariana Islands	Total	2018	56882.0
9147	Palestine	Total	2018	4862979.0
9603	Puerto Rico	Total	2018	3039596.0
9831	Réunion	Total	2018	882526.0
10012	Saint Barthélemy	Total	2018	9816.0
10067	Saint Helena, Ascension and Tristan da Cunha	Total	2018	6035.0
10238	Saint Pierre and Miquelon	Total	2018	5849.0
10305	Saint-Martin (French part)	Total	2018	37264.0
10840	Sint Maarten (Dutch part)	Total	2018	41940.0
11830	Tokelau	Total	2018	1319.0
12172	Turks and Caicos Islands	Total	2018	37665.0
12628	United States Virgin Islands	Total	2018	104680.0
12970	Wallis and Futuna Islands	Total	2018	11661.0
13027	Western Sahara	Total	2018	567402.0

Etude sur l'eau potable¶

Préparation des données¶

1. Importation¶

2. Nettoyage¶

Table population¶

Table mortality¶

Table political_stability¶

Table water_service¶

Table region¶

Vérification correspondance Country entre les df¶

3. Jointures de Region avec les autres fichiers ?¶

4. Exportation¶

	Country	Granularity	Year	Population
0	Afghanistan	Total	2000	20779.953
1	Afghanistan	Male	2000	10689.508
2	Afghanistan	Female	2000	10090.449
3	Afghanistan	Rural	2000	15657.474
4	Afghanistan	Urban	2000	4436.282

	Country	Granularity	Year	Population
2472	China	Total	2018	1.459378e+09
2643	China, mainland	Total	2018	1.427648e+09
5688	India	Total	2018	1.352642e+09
12628	United States of America	Total	2018	3.270963e+08
5745	Indonesia	Total	2018	2.676705e+08
9090	Pakistan	Total	2018	2.122283e+08
1617	Brazil	Total	2018	2.094693e+08
8748	Nigeria	Total	2018	1.958747e+08
966	Bangladesh	Total	2018	1.613767e+08
10002	Russian Federation	Total	2018	1.457340e+08

	Year	Country	Granularity	Mortality rate attributed to exposure to unsafe WASH services	WASH deaths
0	2016	Afghanistan	Female	15.31193	NaN
1	2016	Afghanistan	Male	12.61297	NaN
2	2016	Afghanistan	Total	13.92067	4824.353
3	2016	Albania	Female	0.12552	NaN
4	2016	Albania	Male	0.20650	NaN

	Country	Year	Political_Stability	Granularity
0	Afghanistan	2000	-2.44	Total
1	Afghanistan	2002	-2.04	Total
2	Afghanistan	2003	-2.20	Total
3	Afghanistan	2004	-2.30	Total
4	Afghanistan	2005	-2.07	Total

	Country	Year	Political_Stability	Granularity
0	Afghanistan	2000	-2.440	Total
1	Afghanistan	2002	-2.040	Total
2	Afghanistan	2003	-2.200	Total
3	Afghanistan	2004	-2.300	Total
4	Afghanistan	2005	-2.070	Total
...	...	...	...	...
3707	Venezuela (Bolivarian Republic of)	2001	-1.040	NaN
3708	Viet Nam	2001	0.380	NaN
3709	Yemen	2001	-1.225	NaN
3710	Zambia	2001	-0.125	NaN
3711	Zimbabwe	2001	-1.420	NaN

	REGION (DISPLAY)	COUNTRY (DISPLAY)
0	Europe	Albania
1	Europe	Andorra
2	Europe	Armenia
3	Western Pacific	Australia
4	Europe	Austria

	Country	Year	Political_Stability	Granularity
376	Bermuda	2000	0.71	Total
699	China, Hong Kong SAR	2000	0.93	Total
718	China, Macao SAR	2000	0.49	Total
737	China, Taiwan Province of	2000	0.54	Total
756	China, mainland	2000	-0.21	Total
2443	North Macedonia	2000	-0.62	Total
2535	Palestine	2000	-1.55	Total
2687	Puerto Rico	2000	0.26	Total