This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BertForQuestionAnswering( | |
(bert): BertModel( | |
(embeddings): BertEmbeddings( | |
(word_embeddings): Embedding(30522, 768, padding_idx=0) | |
(position_embeddings): Embedding(512, 768) | |
(token_type_embeddings): Embedding(2, 768) | |
(LayerNorm): BertLayerNorm() | |
(dropout): Dropout(p=0.1) | |
) | |
(encoder): BertEncoder( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
%matplotlib inline | |
from sklearn.metrics import r2_score | |
# importing R functions | |
#!pip install rpy2 | |
import rpy2.robjects as robjects | |
r_predict = robjects.r["predict"] | |
r_lm = robjects.r["lm"] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# prepare train data | |
df_train_summary = df_train_2.groupby(['date']) \ | |
.agg({'price': np.mean}).reset_index() | |
min_date = df_train_summary.date.min() | |
df_train_summary.date = df_train_summary.date - min_date | |
df_train_summary.date = df_train_summary.date.dt.days | |
df_train_summary.sample(frac=.01) | |
# prepare test data | |
df_test_2 = convert_date(df_test) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
# load the data | |
df_train = pd.read_csv('calendar_train.csv') | |
df_test = pd.read_csv('calendar_test.csv') | |
# convert dates | |
def convert_date(df): | |
df = df[~ df.price.isnull()] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Missing Not at Random (MNAR) | |
# randomly mark half of x1 samples as missing MNAR | |
# depending on unrecorded predictor x3 | |
x3 = np.random.uniform(0, 1, 100) | |
idx_mnar = x3 > .5 | |
fig, ax = plt.subplots(1,2,figsize=(15,5)) | |
ax[0].scatter(x1, y, label='data') | |
ax[0].scatter(x1[idx_mnar], y[idx_mnar], label='missing', color='red') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Missing at Random (MAR) | |
# randomly mark half of x1 samples as missing MAR | |
# depending on value of recorded predictor x2 | |
idx_mar = x2 == 1 | |
fig, ax = plt.subplots(1,2,figsize=(15,5)) | |
ax[0].scatter(x1, y, label='data') | |
ax[0].scatter(x1[idx_mar], y[idx_mar], label='missing', color='red') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Missing Completely at Random (MCAR) | |
# randomly mark half of x1 samples as missing MCAR | |
# independent of any information recorded | |
idx_mcar= np.random.choice([0, 1], size=(100,)) == 1 | |
plt.scatter(x1,y, label='data') | |
plt.scatter(x1[idx_mcar],y[idx_mcar], label='missing (MCAR)', color='red') | |
plt.xlabel('x1') | |
plt.ylabel('y') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
## create a synthetic dataset | |
# measured predictors x1, x2 | |
x1 = np.sort(np.random.uniform(-2.0, 3.0, 100)) # random real numbers | |
x2 = np.random.choice([0, 1], size=(100,)) # random binary numbers |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
coef_perc_df = coef_df.copy() | |
cols = coef_df.columns.difference(['drop_rows']).values | |
for col in cols: | |
coef_perc_df[col] = np.round(100*(coef_df[col]/coef_df['drop_rows']-1)) | |
coef_perc_df[['drop_rows','mean','model_basic','model_progressive']] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get index of strategies | |
lr_results_df = pd.DataFrame(lr_results) | |
strategies = lr_results_df['imputation strategy'] | |
# get a boolean array where True => standardized | |
standardized = lr_results_df['standardized'] | |
st = lambda s: ' standardized' if s else '' | |
coefs_ = {} | |
for key, value in enumerate(strategies): | |
if value == 'drop_cols': |