Last active
September 9, 2017 11:43
-
-
Save halegreen/44b2cf3a89ff83dd78ad2ab5efdc471a to your computer and use it in GitHub Desktop.
Likelihood encoding implementation using two-level cross-validation (CV).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.model_selection import KFold | |
import dill as pickle | |
import sys | |
def input_data(train_file):
    """Load the pickled training frame and list its object-dtype columns.

    :param train_file: path of a pickle file; the unpickled object is used
        as a pandas DataFrame (its ``dtypes`` and ``columns`` are read)
    :return: tuple ``(train_data, cat_feature)`` where ``cat_feature`` is the
        list of column names whose dtype equals ``object``
    """
    # NOTE(security): unpickling can execute arbitrary code stored in the
    # file, so only load pickles that come from a trusted source.
    with open(train_file, 'rb') as f1:
        train_data = pickle.load(f1)

    # Object-dtype columns are treated as the categorical features.
    cat_feature = [
        feature
        for dtype, feature in zip(train_data.dtypes, train_data.columns)
        if dtype == object
    ]
    # print() with a single argument behaves the same on Python 2 and 3.
    print(cat_feature)
    return (train_data, cat_feature)
def clean_noise(data, min_logerror=-0.4, max_logerror=0.418, target='logerror'):
    """Return the rows whose target value lies strictly between two bounds.

    The input frame is not modified; callers must use the returned frame.

    :param data: DataFrame containing the ``target`` column
    :param min_logerror: exclusive lower bound (default -0.4)
    :param max_logerror: exclusive upper bound (default 0.418)
    :param target: name of the column to filter on (default ``'logerror'``)
    :return: the subset of ``data`` with
        ``min_logerror < data[target] < max_logerror``
    """
    values = data[target]
    in_range = (values > min_logerror) & (values < max_logerror)
    return data[in_range]
def likelihood_encoding(data, feature, data_type, target='logerror'):
    """Encode a categorical column with nested out-of-fold target means.

    The rows are split into 10 shuffled outer folds.  For every outer fold
    the in-fold rows are split again into 5 shuffled inner folds, the
    per-category mean of ``target`` is computed on each inner in-fold part,
    and those five means are averaged per category.  The average is then
    used to encode the rows held out by the outer fold.

    :param data: DataFrame containing the ``feature`` and ``target`` columns
    :param feature: name of the column to encode
    :param data_type: unused; kept so existing callers keep working
    :param target: name of the target column (default ``'logerror'``)
    :return: tuple ``(likelihood_encoded, mapping, oof_default_mean)``:
        ``likelihood_encoded`` is a Series of encoded values carrying the
        index labels of ``data`` (concatenated in fold order, not sorted);
        ``mapping`` is a Series indexed by category holding the mean of all
        inner-fold means across all outer folds; ``oof_default_mean`` is the
        mean of ``target`` over all of ``data``, used for unseen categories
    """
    # KFold(shuffle=True) without random_state draws from NumPy's global
    # RNG, so seeding it here makes the fold assignment repeatable.
    np.random.seed(2017)
    n_folds = 10
    n_inner_folds = 5

    # Global mean, could be tuned later.
    oof_default_mean = data[target].mean()
    kf = KFold(n_splits=n_folds, shuffle=True)

    encoded_parts = []      # encoded held-out rows, one Series per outer fold
    fold_mean_frames = []   # per-category inner means, one frame per outer fold

    print('raw data shape {}'.format(data.shape))
    for split, (infold, oof) in enumerate(kf.split(data[feature])):
        print('==============level 1 encoding..., fold %s ============' % split)
        infold_data = data.iloc[infold]
        inner_kf = KFold(n_splits=n_inner_folds, shuffle=True)
        inner_oof_default_mean = infold_data[target].mean()

        inner_means = []
        for inner_split, (inner_infold, _) in enumerate(inner_kf.split(infold_data)):
            print('==============level 2 encoding..., inner fold %s ============' % inner_split)
            # The inner positions are relative to the in-fold subset, so they
            # must index infold_data; indexing the full frame would select
            # unrelated rows, including rows held out by the outer fold.
            inner_mean = infold_data.iloc[inner_infold].groupby(by=feature)[target].mean()
            inner_means.append(
                inner_mean.rename('{}_{}_{}'.format(target, split, inner_split))
            )

        # One column per inner fold, indexed by the union of the categories;
        # a category absent from an inner fold gets the in-fold mean.
        inner_oof_mean_cv = pd.concat(inner_means, axis=1).fillna(inner_oof_default_mean)
        fold_mean_frames.append(inner_oof_mean_cv)

        print('============final mapping...===========')
        fold_mapping = inner_oof_mean_cv.mean(axis=1)
        # Categories never seen in the in-fold rows map to NaN and fall back
        # to the global mean.
        encoded_parts.append(
            data.iloc[oof][feature].map(fold_mapping).fillna(oof_default_mean)
        )

    likelihood_encoded = pd.concat(encoded_parts)
    # A category missing from a whole outer fold gets the global mean there.
    oof_mean_cv = pd.concat(fold_mean_frames, axis=1).fillna(oof_default_mean)
    return (likelihood_encoded, oof_mean_cv.mean(axis=1), oof_default_mean)
if __name__ == '__main__':
    # Script entry: load the training frame, drop outlier rows, replace the
    # selected categorical columns with their likelihood encoding, and save
    # the encoded frame.
    i_file = 'train.pkl'
    train_data, cat_feature = input_data(i_file)
    # clean_noise returns a filtered frame instead of modifying its argument,
    # so the result has to be kept.  Copy it so the column assignments below
    # write to an independent frame.
    train_data = clean_noise(train_data).copy()

    # feature name -> (category-to-encoding mapping, default encoding)
    likelihood_coding_map = {}
    debug_cat_feature = ['fipsid', 'tractid']
    # debug_cat_feature = ['blockid']
    for f in debug_cat_feature:
        print('Likelihood coding for {}'.format(f))
        # Positional lookup: the row labelled 0 may have been filtered out.
        data_type = isinstance(train_data[f].iloc[0], float)
        train_data[f], likelihood_coding_mapping, default_coding = likelihood_encoding(
            train_data, f, data_type)
        # Kept so another frame can be encoded the same way later: map each
        # value of column f through the mapping and use the default encoding
        # for values that are not in it.
        likelihood_coding_map[f] = (likelihood_coding_mapping, default_coding)

    print(train_data.head())
    print('=============================================')
    print(train_data['fipsid'].value_counts())

    # The with-statement closes the file; -1 selects the highest pickle
    # protocol available.
    with open('encoded_train.pkl', 'wb') as f1:
        pickle.dump(train_data, f1, -1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment