Skip to content

Instantly share code, notes, and snippets.

View michelkana's full-sized avatar

Michel Kana michelkana

View GitHub Profile
BertForQuestionAnswering(
(bert): BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): BertLayerNorm()
(dropout): Dropout(p=0.1)
)
(encoder): BertEncoder(
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import r2_score
# importing R functions
#!pip install rpy2
import rpy2.robjects as robjects
# look up the R objects named "predict" and "lm" through rpy2 so they can be
# called from Python (presumably R's stats::predict and stats::lm -- confirm)
r_predict = robjects.r["predict"]
r_lm = robjects.r["lm"]
# prepare train data: one row per date holding the mean price for that date
# NOTE: the string alias 'mean' replaces np.mean here; pandas >= 2.1 emits a
# FutureWarning when a NumPy reducer is handed to agg(), and the alias keeps
# the current (groupby-native mean) behaviour.
df_train_summary = (
    df_train_2.groupby(['date'])
    .agg({'price': 'mean'})
    .reset_index()
)
# re-express each date as the number of days elapsed since the earliest date
# (assumes `date` is already a datetime column -- confirm against convert_date)
min_date = df_train_summary.date.min()
df_train_summary.date = df_train_summary.date - min_date
df_train_summary.date = df_train_summary.date.dt.days
# notebook display: peek at a random 1% of the summary rows
df_train_summary.sample(frac=.01)
# prepare test data
df_test_2 = convert_date(df_test)
import pandas as pd
import numpy as np
# load the data: read the train and test calendar CSV files (paths are
# relative to the current working directory) into pandas DataFrames
df_train = pd.read_csv('calendar_train.csv')
df_test = pd.read_csv('calendar_test.csv')
# convert dates
def convert_date(df):
df = df[~ df.price.isnull()]
## Missing Not at Random (MNAR)
# flag about half of the x1 samples as missing (MNAR): whether a sample is
# missing is decided by x3, a predictor drawn here but never recorded
x3 = np.random.uniform(low=0, high=1, size=100)
idx_mnar = np.greater(x3, .5)
# two side-by-side panels; the left one shows all points with the
# missing ones overlaid in red
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax[0].scatter(x1, y, label='data')
ax[0].scatter(x1[idx_mnar], y[idx_mnar], color='red', label='missing')
## Missing at Random (MAR)
# flag x1 samples as missing (MAR): a sample is missing exactly when the
# recorded predictor x2 equals 1
idx_mar = (x2 == 1)
# two side-by-side panels; the left one shows all points with the
# missing ones overlaid in red
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax[0].scatter(x1, y, label='data')
ax[0].scatter(x1[idx_mar], y[idx_mar], color='red', label='missing')
## Missing Completely at Random (MCAR)
# flag about half of the x1 samples as missing (MCAR): the mask is an
# independent 0/1 draw per sample, independent of any recorded information
idx_mcar = np.random.choice([0, 1], size=100) == 1
# plot every point, then overlay the ones flagged as missing in red
plt.scatter(x1, y, label='data')
plt.scatter(x1[idx_mcar], y[idx_mcar], color='red', label='missing (MCAR)')
plt.xlabel('x1')
plt.ylabel('y')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
## build a synthetic dataset
# observed predictors x1 and x2
x1 = np.random.uniform(low=-2.0, high=3.0, size=100)  # random real numbers
x1.sort()  # ascending order, in place
x2 = np.random.choice([0, 1], size=100)  # random binary numbers
# express every column except 'drop_rows' as a rounded percentage change
# relative to the 'drop_rows' column
coef_perc_df = coef_df.copy()
cols = coef_df.columns.difference(['drop_rows']).to_numpy()
for col in cols:
    coef_perc_df[col] = np.round((coef_df[col] / coef_df['drop_rows'] - 1) * 100)
# notebook display: baseline next to the selected strategies
coef_perc_df.loc[:, ['drop_rows', 'mean', 'model_basic', 'model_progressive']]
# wrap the regression results in a DataFrame and pull out the
# 'imputation strategy' column
lr_results_df = pd.DataFrame(data=lr_results)
strategies = lr_results_df['imputation strategy']
# per-row flags where True => standardized
standardized = lr_results_df['standardized']


def st(s):
    """Return the ' standardized' label suffix when *s* is truthy, else ''."""
    return ' standardized' if s else ''


# starts empty; the code that fills it is outside this block
coefs_ = {}
for key, value in enumerate(strategies):
if value == 'drop_cols':