Michel Kana michelkana

I dedicate myself to data for good, bias-free machine learning, and fair algorithms.

michelkana / bert_qa_architecture.py

Last active March 20, 2021 10:02

	BertForQuestionAnswering(
	(bert): BertModel(
	(embeddings): BertEmbeddings(
	(word_embeddings): Embedding(30522, 768, padding_idx=0)
	(position_embeddings): Embedding(512, 768)
	(token_type_embeddings): Embedding(2, 768)
	(LayerNorm): BertLayerNorm()
	(dropout): Dropout(p=0.1)
	)
	(encoder): BertEncoder(

michelkana / airbnb_polyreg_R.py

Created June 13, 2020 23:11

	import matplotlib.pyplot as plt
	%matplotlib inline
	from sklearn.metrics import r2_score

	# Look up the R functions `predict` and `lm` by name through rpy2 so that
	# they can be called from Python. Requires the rpy2 package; the line below
	# is the (commented-out) notebook install command.
	#!pip install rpy2
	import rpy2.robjects as robjects
	r_predict = robjects.r["predict"]
	r_lm = robjects.r["lm"]

michelkana / airbnb_polyreg_data.py

Created June 13, 2020 22:41

	# prepare train data: average price per calendar date
	# NOTE: the string 'mean' is used instead of np.mean. pandas currently maps
	# np.mean to its built-in groupby mean anyway, but passing the callable is
	# deprecated (FutureWarning since pandas 2.1); the string keeps the same result.
	df_train_summary = df_train_2.groupby(['date']) \
	.agg({'price': 'mean'}).reset_index()
	# re-express each date as the number of days elapsed since the earliest date
	min_date = df_train_summary.date.min()
	df_train_summary.date = df_train_summary.date - min_date
	df_train_summary.date = df_train_summary.date.dt.days
	# notebook preview: display a random 1% of the aggregated rows
	df_train_summary.sample(frac=.01)

	# prepare test data
	df_test_2 = convert_date(df_test)

michelkana / airbnb_price_per_month.py

Last active May 4, 2022 19:47

	import pandas as pd
	import numpy as np

	# load the train and test calendar data from CSV files (paths are relative
	# to the current working directory)
	df_train = pd.read_csv('calendar_train.csv')
	df_test = pd.read_csv('calendar_test.csv')

	# convert dates
	def convert_date(df):
	df = df[~ df.price.isnull()]

michelkana / missing_data_mnar.py

Last active April 14, 2020 15:28

	## Missing Not at Random (MNAR)

	# randomly mark roughly half of the x1 samples as missing (MNAR),
	# depending on an unrecorded predictor x3
	x3 = np.random.uniform(0, 1, 100)  # 100 uniform draws between 0 and 1
	idx_mnar = x3 > .5  # boolean mask: True where the sample is treated as missing

	# two side-by-side panels; the left one shows all points and overlays the
	# ones flagged as missing in red
	fig, ax = plt.subplots(1,2,figsize=(15,5))
	ax[0].scatter(x1, y, label='data')
	ax[0].scatter(x1[idx_mnar], y[idx_mnar], label='missing', color='red')

michelkana / missing_data_mar.py

Last active April 14, 2020 15:31

	## Missing at Random (MAR)

	# mark x1 samples as missing (MAR) depending on the value of the
	# recorded predictor x2

	idx_mar = x2 == 1  # boolean mask: True wherever x2 equals 1

	# two side-by-side panels; the left one shows all points and overlays the
	# ones flagged as missing in red
	fig, ax = plt.subplots(1,2,figsize=(15,5))
	ax[0].scatter(x1, y, label='data')
	ax[0].scatter(x1[idx_mar], y[idx_mar], label='missing', color='red')

michelkana / missing_data_mcar.py

Created April 14, 2020 13:59

	## Missing Completely at Random (MCAR)

	# randomly mark roughly half of the x1 samples as missing (MCAR),
	# independent of any recorded information
	# (each of the 100 entries is drawn as 0 or 1; entries equal to 1 are flagged)
	idx_mcar= np.random.choice([0, 1], size=(100,)) == 1

	# plot all points, then overlay the ones flagged as missing in red
	plt.scatter(x1,y, label='data')
	plt.scatter(x1[idx_mcar],y[idx_mcar], label='missing (MCAR)', color='red')
	plt.xlabel('x1')
	plt.ylabel('y')

michelkana / missing_data_synthetic.py

Created April 14, 2020 13:21

	import numpy as np
	import matplotlib.pyplot as plt
	import pandas as pd

	## create a synthetic dataset

	# measured predictors x1, x2
	x1 = np.sort(np.random.uniform(-2.0, 3.0, 100)) # 100 uniform draws between -2 and 3, sorted ascending
	x2 = np.random.choice([0, 1], size=(100,)) # 100 random values, each 0 or 1

michelkana / missing_data_logistic_regression_coeffs2.py

Created April 13, 2020 19:11

	# Express every coefficient column as a percentage change relative to the
	# 'drop_rows' column, rounded to a whole percent. The 'drop_rows' column
	# itself is left as copied from coef_df.
	coef_perc_df = coef_df.copy()
	cols = coef_df.columns.difference(['drop_rows']).values
	for col in cols:
		# the loop body must be nested under the `for`; without the extra
		# indent the snippet does not parse
		coef_perc_df[col] = np.round(100*(coef_df[col]/coef_df['drop_rows']-1))
	# notebook display of the selected columns
	coef_perc_df[['drop_rows','mean','model_basic','model_progressive']]

michelkana / missing_data_logistic_regression_coeffs.py

Last active April 13, 2020 19:10

	# build a DataFrame from the collected results and take the column holding
	# the imputation strategy of each row
	lr_results_df = pd.DataFrame(lr_results)
	strategies = lr_results_df['imputation strategy']

	# get a boolean array where True => standardized
	standardized = lr_results_df['standardized']
	# label suffix helper: ' standardized' for a truthy flag, '' otherwise
	st = lambda s: ' standardized' if s else ''
	# presumably filled in by the loop that follows (not fully visible here)
	coefs_ = {}
	for key, value in enumerate(strategies):
	if value == 'drop_cols':