Links

Get Covid19 data

Tags: #johnshopkins #opendata #analytics #csv
Author: Florent Ravenel
Last update: 2023-04-12 (Created: 2022-03-07)
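Description: This notebook downloads the Johns Hopkins University (JHU) COVID-19 global time series (confirmed, deaths, recovered), reshapes them into a single long-format table with a computed "Active cases" indicator, and saves the result as a CSV file.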

Input

Import libraries

import pandas as pd
import naas

Variables

# Raw JHU CSSE time-series CSV files, one per indicator. The files share the
# same location and differ only in the indicator name embedded in the file name.
urls = [
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
    f"time_series_covid19_{indicator}_global.csv"
    for indicator in ("confirmed", "deaths", "recovered")
]
# Name of the output dataset and the CSV file it is written to
title = "DB_COVID19_JHU"
output_csv = title + ".csv"

Model

Get data from JHU

def get_data_url(urls):
    """Load every CSV in ``urls`` and stack them into one DataFrame.

    Each file is read with ``pd.read_csv`` and tagged with an ``Indicator``
    column derived from its file name: the text between
    ``/time_series_covid19_`` and ``_global.csv``, capitalized
    (e.g. ``.../time_series_covid19_deaths_global.csv`` -> ``"Deaths"``).

    Args:
        urls: Iterable of URLs or file paths accepted by ``pd.read_csv``.

    Returns:
        The row-wise concatenation of all files (each file keeps its own
        row index), or an empty DataFrame when ``urls`` is empty.
    """
    frames = []
    for url in urls:
        frame = pd.read_csv(url)
        frame["Indicator"] = (
            url.split("/time_series_covid19_")[-1].split("_global.csv")[0].capitalize()
        )
        frames.append(frame)
    # pd.concat raises on an empty list, so return an empty frame explicitly.
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop, which
    # re-copies all previously loaded rows on every iteration.
    return pd.concat(frames)
# Download and combine every dataset listed in `urls` (reads the CSVs over the network).
df_init = get_data_url(urls)
# Bare expression: shows the combined frame as the cell output when run in a notebook.
df_init

Clean, reshape and aggregate the JHU data (adds the "Active cases" indicator)

def get_all_data(df_init):
    """Reshape the combined JHU frame into a long table with active cases.

    Steps:
      1. Drop ``Province/State`` and melt the remaining non-key columns
         into ``Date`` / ``Value`` rows, replacing missing values with 0.
      2. Parse ``Date`` with ``pd.to_datetime``.
      3. Add an ``"Active cases"`` indicator: every row is duplicated with
         the values of ``"Deaths"`` and ``"Recovered"`` negated, so the
         later sum yields confirmed - deaths - recovered.
      4. Sum ``Value`` per (Country/Region, Lat, Long, Indicator, Date).
      5. Rename ``Country/Region`` to ``COUNTRY`` and upper-case all columns.

    Args:
        df_init: DataFrame with ``Province/State``, ``Country/Region``,
            ``Lat``, ``Long`` and ``Indicator`` columns; every other column
            is treated as a date column. The input is not modified.

    Returns:
        DataFrame with columns ``COUNTRY``, ``LAT``, ``LONG``,
        ``INDICATOR``, ``DATE``, ``VALUE`` and a fresh 0..n-1 index.
    """
    key_cols = ["Country/Region", "Lat", "Long", "Indicator"]

    # Wide -> long: one row per key combination and date.
    long_df = (
        df_init.drop(columns="Province/State")
        .melt(id_vars=key_cols, var_name="Date", value_name="Value")
        .fillna(0)
    )
    long_df["Date"] = pd.to_datetime(long_df["Date"])

    # Signed copy of the data: deaths and recoveries count negatively so
    # that summing the copy per group gives the number of active cases.
    is_subtracted = long_df["Indicator"].isin(["Deaths", "Recovered"])
    active = long_df.copy()
    active["Value"] = active["Value"].where(~is_subtracted, -active["Value"])
    active["Indicator"] = "Active cases"

    # Aggregate the original rows together with the derived indicator.
    totals = (
        pd.concat([long_df, active])
        .groupby(key_cols + ["Date"], as_index=False)["Value"]
        .sum()
    )

    # Final column names: "Country/Region" becomes "COUNTRY", the rest are upper-cased.
    new_names = {col: col.upper() for col in totals.columns}
    new_names["Country/Region"] = "COUNTRY"
    return totals.rename(columns=new_names).reset_index(drop=True)
# Build the cleaned long-format table (with the "Active cases" indicator) from the raw data.
df_clean = get_all_data(df_init)
# Bare expression: shows the cleaned frame as the cell output when run in a notebook.
df_clean

Output

Save dataframe in csv

# Write the cleaned dataset to `output_csv`; index=False leaves the row index out of the file.
df_clean.to_csv(output_csv, index=False)