Skip to content

Instantly share code, notes, and snippets.

@halegreen
Last active September 9, 2017 11:43
Show Gist options
  • Save halegreen/44b2cf3a89ff83dd78ad2ab5efdc471a to your computer and use it in GitHub Desktop.
Save halegreen/44b2cf3a89ff83dd78ad2ab5efdc471a to your computer and use it in GitHub Desktop.
Likelihood (target-mean) encoding implementation, using 2-level cross-validation to avoid target leakage.
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import dill as pickle
import sys
def input_data(train_file):
    """Load the pickled training DataFrame and list its categorical columns.

    :param train_file: path to a pickle file containing a pandas DataFrame
    :return: (train_data DataFrame, list of column names with object dtype)
    """
    with open(train_file, 'rb') as f1:
        train_data = pickle.load(f1)
    # Columns with object dtype are treated as categorical features.
    cat_feature = [feature
                   for dtype, feature in zip(train_data.dtypes, train_data.columns)
                   if dtype == object]
    # print() form works on both Python 2 and 3 (original used a py2-only statement).
    print(cat_feature)
    return (train_data, cat_feature)
def clean_noise(data):
    """Return only the rows whose 'logerror' lies strictly inside (-0.4, 0.418).

    Rows at or beyond either bound are treated as label noise and dropped.
    The input frame is not modified; a filtered view/copy is returned.
    """
    lower_bound = -0.4
    upper_bound = 0.418
    in_range = (data['logerror'] > lower_bound) & (data['logerror'] < upper_bound)
    return data[in_range]
def likelihood_encoding(data, feature, data_type, target = 'logerror'):
    """Likelihood (target-mean) encode a categorical feature with 2-level CV.

    Outer 10-fold CV: each out-of-fold chunk is encoded using only statistics
    computed from the in-fold rows, which prevents target leakage.
    Inner 5-fold CV: within each outer in-fold, per-category target means are
    computed on each inner in-fold and averaged, smoothing the estimates.

    :param data: DataFrame containing the `feature` and `target` columns
    :param feature: name of the categorical column to encode
    :param data_type: unused flag, kept for interface compatibility
    :param target: name of the numeric target column (default 'logerror')
    :return: (encoded Series indexed like `data`,
              per-category encoding averaged over all outer folds,
              global target mean used as fallback for unseen categories)
    """
    # KFold(shuffle=True) without random_state draws from the global RNG,
    # so seeding here makes the fold assignment reproducible.
    np.random.seed(2017)
    n_folds = 10
    n_inner_folds = 5
    likelihood_encoded = pd.Series(dtype='float64')
    ## global mean, could be tuned later; fallback for unseen categories
    oof_default_mean = data[target].mean()
    kf = KFold(n_splits=n_folds, shuffle=True)
    oof_mean_cv = pd.DataFrame()
    split = 0
    print('raw data shape {}'.format(data.shape))
    for infold, oof in kf.split(data[feature]):
        print('==============level 1 encoding..., fold %s ============' % split)
        inner_kf = KFold(n_splits=n_inner_folds, shuffle=True)
        inner_oof_default_mean = data.iloc[infold][target].mean()
        inner_split = 0
        ## per-category means collected across inner folds (one column each),
        ## used for the outer out-of-fold mapping below
        inner_oof_mean_cv = pd.DataFrame()
        for inner_infold, inner_oof in inner_kf.split(infold):
            print('==============level 2 encoding..., inner fold %s ============' % inner_split)
            # inner_infold holds positions WITHIN infold, so translate them to
            # positions in `data`. (The original indexed `data` directly with
            # inner_infold, silently computing means over rows outside the
            # outer in-fold — a leakage bug.)
            oof_mean = data.iloc[infold[inner_infold]].groupby(by=feature)[target].mean()
            # Collect this inner fold's per-category means as a new column;
            # categories unseen in the inner in-fold fall back to its mean.
            inner_oof_mean_cv = inner_oof_mean_cv.join(
                pd.DataFrame(oof_mean), rsuffix=str(inner_split), how='outer')
            inner_oof_mean_cv.fillna(inner_oof_default_mean, inplace=True)
            inner_split += 1
        # NOTE: the original also built a `likelihood_encoded_cv` series here at
        # quadratic cost; it was never read, so that dead code is removed.
        oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv),
                                       rsuffix=str(split), how='outer')
        oof_mean_cv.fillna(value=oof_default_mean, inplace=True)
        split += 1
        print('============final mapping...===========')
        # Encode the outer out-of-fold rows with the mean of the inner-fold
        # estimates; unseen categories get the global mean.
        encoded_oof = data.iloc[oof].apply(
            lambda x: np.mean(inner_oof_mean_cv.loc[x[feature]].values)
            if x[feature] in inner_oof_mean_cv.index
            else oof_default_mean,
            axis=1)
        likelihood_encoded = pd.concat([likelihood_encoded, encoded_oof])
    return (likelihood_encoded, oof_mean_cv.mean(axis = 1), oof_default_mean)
if __name__ == '__main__':
    i_file = 'train.pkl'
    train_data, cat_feature = input_data(i_file)
    # BUG FIX: the original called clean_noise() and discarded its return
    # value, so no rows were ever filtered. Assign the result, and reset the
    # index so positional lookups like loc[0] below remain valid.
    train_data = clean_noise(train_data).reset_index(drop=True)
    likelihood_coding_map = {}
    debug_cat_feature = ['fipsid', 'tractid']
    for f in debug_cat_feature:
        data_type = False
        print('Likelihood coding for {}'.format(f))
        # type(...) == float deliberately excludes numpy float subclasses,
        # matching the original check; data_type is currently unused downstream.
        if type(train_data.loc[0][f]) == float:
            data_type = True
        train_data[f], likelihood_coding_mapping, default_coding = \
            likelihood_encoding(train_data, f, data_type)
    print(train_data.head())
    print('=============================================')
    print(train_data['fipsid'].value_counts())
    # Protocol -1 = highest available pickle protocol. The `with` block closes
    # the file; the original's explicit f1.close() inside it was redundant.
    with open('encoded_train.pkl', 'wb') as f1:
        pickle.dump(train_data, f1, -1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment