# COVID-19 case study

# import packages
import pandas as pd
import pdpipe as pdp
import numpy as np

from sklearn import preprocessing
import time
from datetime import datetime

# get data (Johns Hopkins CSSE COVID-19 time series)
confirmed_ts_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")
deaths_ts_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")
recovered_ts_df = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv")

# melt the data to long form (from the wide TS format)
confirmed_ts_melted_df = confirmed_ts_df.melt(
    id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
    var_name='covid_date', value_name='confirmed').copy()

deaths_ts_melted_df = deaths_ts_df.melt(
    id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
    var_name='covid_date', value_name='deaths').copy()

recovered_ts_melted_df = recovered_ts_df.melt(
    id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
    var_name='covid_date', value_name='recovered').copy()

# parse dates and rename columns with a small pdpipe pipeline
pipeline = pdp.PdPipeline([
    pdp.ApplyByCols(['covid_date'], pd.to_datetime),
    pdp.ColRename({'Country/Region': 'country_id', 'Province/State': 'state', 'Lat': 'lat', 'Long': 'long'})
])

confirmed_ts_fcg_df = pipeline.apply(confirmed_ts_melted_df).sort_values(by=['country_id', 'covid_date'])
deaths_ts_fcg_df = pipeline.apply(deaths_ts_melted_df).sort_values(by=['country_id', 'covid_date'])
recovered_ts_fcg_df = pipeline.apply(recovered_ts_melted_df).sort_values(by=['country_id', 'covid_date'])

# merge the three series into a single data frame
covid_df = pd.merge(
    pd.merge(confirmed_ts_fcg_df, deaths_ts_fcg_df, on=['state', 'country_id', 'lat', 'long', 'covid_date']),
    recovered_ts_fcg_df, on=['state', 'country_id', 'lat', 'long', 'covid_date'])
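
# Optional sanity check: an inner merge silently drops rows whose
# location/date keys are missing from one of the inputs (recovered counts, for
# example, are reported at a coarser level for some countries), so it is worth
# comparing row counts before and after merging.
print(len(confirmed_ts_fcg_df), len(deaths_ts_fcg_df), len(recovered_ts_fcg_df))
print(len(covid_df))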

# Feature engineering - datetime-related attributes, lags, trends, etc.
le = preprocessing.LabelEncoder()

covid_df_corr = covid_df.copy()
# encode each date as a consecutive day index (0, 1, 2, ...)
covid_df_corr['day_num'] = le.fit_transform(covid_df_corr.covid_date)
covid_df_corr['day'] = covid_df_corr['covid_date'].dt.day
covid_df_corr['month'] = covid_df_corr['covid_date'].dt.month
covid_df_corr['year'] = covid_df_corr['covid_date'].dt.year

# fill null values: 'state' is missing for countries without a province/state
# breakdown, and the count columns are filled defensively
covid_df_corr['state'] = covid_df_corr['state'].fillna("None")
covid_df_corr['confirmed'] = covid_df_corr['confirmed'].fillna(0)
covid_df_corr['deaths'] = covid_df_corr['deaths'].fillna(0)
covid_df_corr['recovered'] = covid_df_corr['recovered'].fillna(0)

# lag and trend calculation
def calculate_trend(df, lag_list, column):
    for lag in lag_list:
        trend_column_lag = "Trend_" + column + "_" + str(lag)
        shifted = df[column].shift(lag, fill_value=0)
        # relative change vs. the value `lag` rows earlier; division by zero
        # produces inf/NaN values, which are cleaned up after the features are built
        df[trend_column_lag] = (df[column] - shifted) / shifted
    return df


def calculate_lag(df, lag_list, column):
    for lag in lag_list:
        column_lag = "Lag_" + column + "_" + str(lag)
        df[column_lag] = df[column].shift(lag, fill_value=0)
    return df
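
# The two helpers above shift over the frame as a whole; since the rows are
# sorted by country and date, the first few lagged values of each country spill
# over from the previous country's block. A per-location variant is sketched
# below (an optional alternative, not used by the steps that follow; grouping by
# ['country_id', 'state'] is an assumption about the desired granularity).
def calculate_lag_per_location(df, lag_list, column):
    for lag in lag_list:
        column_lag = "Lag_" + column + "_" + str(lag)
        # shift within each (country, state) group so lags never cross locations
        df[column_lag] = df.groupby(['country_id', 'state'])[column].shift(lag, fill_value=0)
    return df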

ts = time.time()
covid_df_corr = calculate_lag(covid_df_corr, range(1, 7), 'confirmed')
covid_df_corr = calculate_lag(covid_df_corr, range(1, 7), 'deaths')
covid_df_corr = calculate_lag(covid_df_corr, range(1, 7), 'recovered')
covid_df_corr = calculate_trend(covid_df_corr, range(1, 7), 'confirmed')
covid_df_corr = calculate_trend(covid_df_corr, range(1, 7), 'deaths')
covid_df_corr = calculate_trend(covid_df_corr, range(1, 7), 'recovered')

# clean up the inf/NaN values produced by the trend ratios
covid_df_corr.replace([np.inf, -np.inf], 0, inplace=True)
covid_df_corr.fillna(0, inplace=True)
print("Time spent: ", time.time() - ts)