Covid19 Active Cases
Tags: #johnshopkins #opendata #analytics

Input

Import libraries

1
import pandas as pd
2
import plotly.express as px
3
import plotly.graph_objects as go
4
import matplotlib.pyplot as plt
Copied!

Variables

1
# URLs of the raw csv dataset
2
urls = [
3
'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
4
'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
5
'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
6
]
7
8
confirmed_df, deaths_df, recovered_df = tuple(pd.read_csv(url) for url in urls)
Copied!

Model

Mostly adopted from this COVID19 Data Processing Tutorial
Clean the dataset to show the cases by country
Steps:
  1. 1.
    Convert from Wide to Long Dataframe (Convert all datetimes to a single column)
  2. 2.
    Merge/Join the Confirmed, Deaths and Recovered tables into a single table
  3. 3.
    Converting Date from string to datetime
  4. 4.
    Replacing missing values/NaNs
  5. 5.
    Coronavirus cases reported from 3 cruise ships should be treated differently and adjustments need to be made for Canada (deciding to drop Canada due to missing recovery data)
  6. 6.
    Get Active Cases = Confirmed - Deaths - Recovered
1
#Wide to Long DataFrame conversion
2
dates = confirmed_df.columns[4:]
3
confirmed_df_long = confirmed_df.melt(
4
id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
5
value_vars=dates,
6
var_name='Date',
7
value_name='Confirmed'
8
)
9
deaths_df_long = deaths_df.melt(
10
id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
11
value_vars=dates,
12
var_name='Date',
13
value_name='Deaths'
14
)
15
recovered_df_long = recovered_df.melt(
16
id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
17
value_vars=dates,
18
var_name='Date',
19
value_name='Recovered'
20
)
21
22
# Adjust for Canada
23
recovered_df_long = recovered_df_long[(recovered_df_long['Country/Region']!='Canada')]
Copied!
1
# Join into one single dataframe/table
2
# Merging confirmed_df_long and deaths_df_long
3
full_table = confirmed_df_long.merge(
4
right=deaths_df_long,
5
how='left',
6
on=['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
7
)
8
# Merging full_table and recovered_df_long
9
full_table = full_table.merge(
10
right=recovered_df_long,
11
how='left',
12
on=['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
13
)
14
15
# Convert date strings to actual dates
16
full_table['Date'] = pd.to_datetime(full_table['Date'])
17
# Handle some missing values / NaNs
18
full_table['Recovered'] = full_table['Recovered'].fillna(0).astype('int64')
Copied!
1
full_table.isna().sum()
2
# full_table.dtypes
Copied!
1
# Adjust for Canada and 3 cruise ships
2
ship_rows = full_table['Province/State'].str.contains('Grand Princess') | full_table['Province/State'].str.contains('Diamond Princess') | full_table['Country/Region'].str.contains('Diamond Princess') | full_table['Country/Region'].str.contains('MS Zaandam')
3
full_ship = full_table[ship_rows]
4
full_table = full_table[~(ship_rows)]
5
6
# Add one more entry for each day to get the entire world's counts/totals
7
world_dict = {"Country/Region": "World", "Confirmed": pd.Series(full_table.groupby(['Date'])['Confirmed'].sum()), "Deaths": pd.Series(full_table.groupby(['Date'])['Deaths'].sum()),"Recovered": pd.Series(full_table.groupby(['Date'])['Recovered'].sum())}
8
world_df = pd.DataFrame.from_dict(world_dict).reset_index()
9
print(world_df.columns)
10
full_table = pd.concat([full_table, world_df], ignore_index=True)
Copied!
1
# Active Cases = Confirmed - Deaths - Recovered
2
full_table['Active'] = full_table['Confirmed'] - full_table['Deaths'] - full_table['Recovered']
3
4
full_grouped = full_table.groupby(['Date', 'Country/Region'])['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()
Copied!
1
len(full_grouped["Country/Region"].unique())
Copied!

Interactive Dropdown Visualization for Active Cases by Country

First, need to go back from long to wide for a format suited to the visualization using df.pivot()
Mostly adopted from this Interactive Dropdown Tutorial
1
# Go back from long to wide for viz purposes
2
df = full_grouped
3
df.rename(columns={"Country/Region": "Country"}, inplace=True)
4
df_confirmed = df[["Date", "Country", "Confirmed"]]
5
df_deaths = df[["Date", "Country", "Deaths"]]
6
df_active = df[["Date", "Country", "Active"]]
7
df_recovered = df[["Date", "Country", "Recovered"]]
8
9
df_confirmed = df_confirmed.pivot(index="Date", columns="Country", values="Confirmed")
10
df_deaths = df_deaths.pivot(index="Date", columns="Country", values="Deaths")
11
df_recovered = df_recovered.pivot(index="Date", columns="Country", values="Recovered")
12
df_active = df_active.pivot(index="Date", columns="Country", values="Active")
Copied!
1
def create_layout_button(df, column):
2
first, latest = df.index.values[0], df.index.values[-1]
3
return dict(label = column,
4
method = 'update',
5
args = [{'visible': df.columns.isin([column]),
6
'title': column,
7
'xaxis.range': [first, latest],
8
'showlegend': True
9
}])
10
11
def multi_plot(df, title, addAll = True):
12
first, latest = df.index.values[0], df.index.values[-1]
13
fig = go.Figure()
14
15
for column in df.columns.to_list():
16
fig.add_trace(
17
go.Scatter(
18
x = df.index,
19
y = df[column],
20
name = column
21
)
22
)
23
24
button_all = dict(label = 'All',
25
method = 'update',
26
args = [{'visible': df.columns.isin(df.columns),
27
'title': 'All',
28
'xaxis.range': [first, latest],
29
'showlegend':True}])
30
31
# Need "World" to be the default choice if "All" is not shown
32
button_world = create_layout_button(df, "World")
33
34
fig.update_layout(
35
updatemenus=[{
36
"active": 0,
37
"buttons": ([button_all] * addAll) + [button_world] + [create_layout_button(df, column) for column in df.columns if column != "World"],
38
"showactive": True
39
}
40
],
41
yaxis_type="log"
42
)
43
44
# Update remaining layout properties
45
fig.update_layout(
46
title_text=title,
47
# annotations=[dict(
48
# text="Country:",
49
# x=0, y=0
50
# )]
51
)
52
53
fig.show()
Copied!
1
# test_df_active = df_active.swapaxes("index", "columns")
2
test_df_active = df_active
3
latest = test_df_active.index.values[-1]
4
print(latest)
5
test_df_active = test_df_active.T.sort_values(by=latest, ascending=False).head(11).T
6
test_df_active
Copied!

Output

Logarithmic COVID-19 time series

1
multi_plot(test_df_active, title="Logarithmic COVID-19 time series Active Cases by Country (Top 10)")
Copied!
1
multi_plot(df_active, title="Logarithmic COVID-19 time series Active Cases by Country", addAll=False)
Copied!

World Health Indicator (WHI)

Using a scale of 0 - 10 and rescaling the number of Active Cases / Confirmed Cases on the entire World's Data
(where 0 is the worst and 10 is the best)
\begin{equation*} WHI = 10 - 10 \times \frac{Current - Min}{Max - Min} \end{equation*}
(Using Linear Scaling for now, will discuss and develop a better scaling mechanism if required)
1
# Uncomment to get a 30 day Moving Average Statistics and a health indicator based on that
2
3
# df_active["MonthlyAverage"] = df_active["World"].rolling('30D').mean().astype('int64')
4
# curr_30d = df_active.loc[latest, "MonthlyAverage"]
5
# max_30d = df_active["MonthlyAverage"].max()
6
# min_30d = df_active["MonthlyAverage"].min()
7
# WHI_30d = 10 - 10 * ((curr_30d - min_30d) / (max_30d - min_30d))
8
#print(f"World Health Indicator (30 day Moving Average): {round(WHI_30d, 2)}")
Copied!
1
WHI = 10 - 10 * ((df_active.loc[latest, "World"] - df_active["World"].min()) / (df_active["World"].max() - df_active["World"].min()))
2
3
print(f"World Health Indicator (Raw values): {round(WHI, 2)}")
4
WHI_data = pd.DataFrame.from_dict({"DATE_PROCESSED": pd.to_datetime("today").date(), "INDICATOR": "COVID-19 Active Cases", "VALUE": [round(WHI, 2)]})
5
WHI_data
Copied!
Copy link
Edit on GitHub