World population and density
‚Äč‚Äč
‚Äč
‚Äč
Tags: #worldbank #opendata
Author: Jeremy Ravenel
Objective
This graph shows how the world's population is distributed by region. The y-axis measures the population growth over one year, the x-axis indicates the population density, and the size of each circle shows the number of inhabitants.
Source United Nations Population Division.

Input

Import library

1
import pandas as pd
2
import numpy as np
3
import plotly.express as px
4
‚Äč
5
# Options to display more data in the console output
6
# pd.set_option("display.max_rows", 10)
7
# pd.set_option("display.max_columns", 10)
Copied!

Model

Get the data from Excel files

1
# Year labels as strings; they are used below as column names when reading the workbooks.
years = [str(year) for year in range(1950, 2020)]
usecols = ["Region, subregion, country or area *", "Country code", "Type", *years]
# Maps each raw year column to its "population_{year}" name.
renamed_population_columns = {year: f"population_{year}" for year in years}
# Kept initialised here because the density cell may fill it in later.
renamed_density_columns = {}

# `encoding` is intentionally not passed: .xlsx files are binary, and newer
# pandas versions no longer accept that keyword in read_excel.
xls_populations = pd.read_excel(
    'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/EXCEL_FILES/1_Population/WPP2019_POP_F01_1_TOTAL_POPULATION_BOTH_SEXES.xlsx',
    header=16,
    usecols=usecols,
)

# Coerce every year column to numeric (values that cannot be parsed become NaN),
# then rename each year column to "population_{year}".
for year in years:
    xls_populations[year] = pd.to_numeric(xls_populations[year], errors='coerce')
xls_populations = xls_populations.rename(columns=renamed_population_columns)

# Keep only the rows whose type is "Country/Area".
xls_populations = xls_populations[xls_populations['Type'] == "Country/Area"]

xls_populations
Copied!
1
# `encoding` is intentionally not passed: .xlsx files are binary, and newer
# pandas versions no longer accept that keyword in read_excel.
xls_density = pd.read_excel(
    'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/EXCEL_FILES/1_Population/WPP2019_POP_F06_POPULATION_DENSITY.xlsx',
    header=16,
    usecols=["Region, subregion, country or area *", "Country code", "Type", *years],
)

# Maps each raw year column to its "density_{year}" name. Built here in one go
# so this cell does not depend on the dict having been initialised elsewhere.
renamed_density_columns = {year: f"density_{year}" for year in years}

# Coerce every year column to numeric (values that cannot be parsed become NaN),
# then rename each year column to "density_{year}".
for year in years:
    xls_density[year] = pd.to_numeric(xls_density[year], errors='coerce')
xls_density = xls_density.rename(columns=renamed_density_columns)

# Keep only the rows whose type is "Country/Area".
xls_density = xls_density[xls_density['Type'] == "Country/Area"]

xls_density
Copied!

Dataset assembling

1
# Stack the "Population" and "Density" tables on top of each other, then
# collapse rows that share the same index label, keeping the first non-null
# value found in each column.
# NOTE(review): this pairs population and density rows purely by index label,
# so it presumably relies on both workbooks listing rows in the same order —
# confirm, or merge on "Country code" instead.
frames_to_stack = [xls_populations, xls_density]
result = pd.concat(frames_to_stack, sort=False)
n = result.index.nlevels
grouped_by_index = result.groupby(level=range(n))
xls_global = grouped_by_index.first()

xls_global
Copied!

Adding population growth to the dataset

1
# For every year, compare each country's total population with the previous
# year's to derive the growth over one year, in percent.
# The first year is skipped: there is no earlier year to compare it with.
for year in years[1:]:
    past_year = str(int(year) - 1)
    try:
        current_population = xls_global[f'population_{year}']
        previous_population = xls_global[f'population_{past_year}']
    except KeyError:
        # One of the two population columns is missing: no growth can be computed.
        xls_global[f'population_growth_{year}'] = np.nan
        continue
    xls_global[f'population_growth_{year}'] = (
        (current_population - previous_population) / previous_population * 100
    )

xls_global
Copied!

Creating the "Continents and their countries" dataset

1
# Fetch each country's region (continent) from the RestCountries API and rename
# the fields to match the column names used in the global dataset.
# NOTE(review): the restcountries.eu host may no longer be reachable — confirm
# the endpoint before relying on this cell.
countries = (
    pd.read_json(
        'https://restcountries.eu/rest/v2/all?fields=region;numericCode',
        dtype={"numericCode": int},
    )
    .rename(columns={"region": "Region", "numericCode": "Country code"})
    # Drop rows with missing values (data that cannot be processed)
    .dropna()
)
# Normalise the codes to the global dataset's format: strip leading zeros,
# then store them as integers.
codes_without_leading_zeros = countries['Country code'].replace(regex=r"^0+", value='')
countries['Country code'] = codes_without_leading_zeros
countries["Country code"] = countries["Country code"].astype(int)

countries
Copied!

Add a "Region" column to the global dataset

1
# Look up each row's region by matching its "Country code" column against the
# country-code index of the countries table (rows without a match get NaN).
regions_by_country_code = countries.set_index('Country code')
xls_global = xls_global.join(regions_by_country_code, on='Country code')

xls_global
Copied!

Formatting the display

1
# Build the final long-format dataset: one row per (country, year).
# Rows are collected in a plain list and the DataFrame is built once at the
# end: DataFrame.append copied the whole frame on every call and is no longer
# available in pandas 2.x.
formatted_columns = ['COUNTRY', 'YEAR', 'POPULATION', 'POPULATION GROWTH', 'DENSITY', 'REGION']
formatted_rows = []

for _, line in xls_global.iterrows():
    # The first year is skipped: its growth cannot be computed without the
    # previous year's data.
    for year in years[1:]:
        formatted_rows.append(
            {
                'COUNTRY': line['Region, subregion, country or area *'],
                'YEAR': year,
                'POPULATION': line[f"population_{year}"],
                'POPULATION GROWTH': line[f"population_growth_{year}"],
                'DENSITY': line[f"density_{year}"],
                'REGION': line['Region'],
            }
        )

xls_formatted = pd.DataFrame(formatted_rows, columns=formatted_columns)

# Drop rows with missing values (data that cannot be plotted)
xls_formatted = xls_formatted.dropna()

xls_formatted
Copied!

Output

Display the plot with plotly

1
# Animated bubble chart with one frame per year: density on a logarithmic
# x-axis, yearly population growth on the y-axis, bubble size given by the
# population and bubble colour by the region.
fig = px.scatter(
    xls_formatted,
    x="DENSITY",
    y="POPULATION GROWTH",
    animation_frame="YEAR",
    animation_group="COUNTRY",
    size="POPULATION",
    color="REGION",
    hover_name="COUNTRY",
    log_x=True,
    size_max=60,
)
fig.show()
Copied!
Copy link
Edit on GitHub