Created
October 7, 2019 05:43
-
-
Save steermomo/20808f0d37ed9dbf85e4cb7e411c9cc7 to your computer and use it in GitHub Desktop.
feature engineering
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600 | |
# Encoding Functions | |
# Below are 5 encoding functions. | |
# (1) encode_FE does frequency encoding where it combines train and test first and then encodes. | |
# (2) encode_LE label-encodes categorical features (train and test are encoded together)
# (3) encode_AG makes aggregated features such as aggregated mean and std | |
# (4) encode_CB combines two columns | |
# (5) encode_AG2 makes aggregated features that count how many unique values of one feature occur within each group.
# For more explanation about feature engineering, see the discussion here | |
# FREQUENCY ENCODE TOGETHER | |
def encode_FE(df1, df2, cols):
    """Add a frequency-encoded feature for every column in ``cols``.

    For each column the values of ``df1`` and ``df2`` are stacked and the
    normalized value counts (NaN excluded) are used as the encoding, so both
    frames share one frequency table. The result is stored in a new
    ``<col>_FE`` column of dtype float32 on both frames (modified in place).

    The key ``-1`` is always mapped to ``-1`` instead of its real frequency
    (presumably -1 is the missing-value placeholder used upstream -- confirm
    against the caller). Values that are NaN stay NaN in the new column.

    Prints each new column name as progress output and returns None.
    """
    for col in cols:
        stacked = pd.concat([df1[col], df2[col]])
        freq = stacked.value_counts(dropna=True, normalize=True).to_dict()
        # Override whatever frequency -1 had with the sentinel itself.
        freq[-1] = -1
        feature = col + '_FE'
        for frame in (df1, df2):
            frame[feature] = frame[col].map(freq).astype('float32')
        print(feature, ', ', end='')
# LABEL ENCODE | |
def encode_LE(col, train=X_train, test=X_test, verbose=True):
    """Label-encode ``col`` in place on ``train`` and ``test``.

    The column from both frames is stacked and factorized together (with
    sorted uniques), so a given value receives the same integer code in both
    frames. The original column is overwritten with the codes, stored as
    int32 when the largest code exceeds 32000 and as int16 otherwise.

    NOTE(review): pandas' factorize assigns -1 to missing values by default;
    confirm that is the intended encoding for NaN here.

    Prints the column name when ``verbose`` is true. Returns None.
    """
    codes, _ = pd.concat([train[col], test[col]], axis=0).factorize(sort=True)
    n_train = len(train)
    # Use the narrow dtype whenever the code range allows it.
    code_dtype = 'int32' if codes.max() > 32000 else 'int16'
    train[col] = codes[:n_train].astype(code_dtype)
    test[col] = codes[n_train:].astype(code_dtype)
    del codes
    gc.collect()
    if verbose:
        print(col, ', ', end='')
# GROUP AGGREGATION MEAN AND STD | |
# https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda | |
def encode_AG(main_columns, uids, aggregations=['mean'], train_df=X_train, test_df=X_test,
              fillna=True, usena=False):
    """Add group-aggregation features to ``train_df`` and ``test_df`` in place.

    For every combination of ``main_column`` in ``main_columns``, group key
    ``col`` in ``uids`` and ``agg_type`` in ``aggregations``, the statistic
    of ``main_column`` per ``col`` group is computed over train and test
    stacked together and mapped back onto both frames as a float32 column
    named ``<main_column>_<col>_<agg_type>``.

    Parameters
    ----------
    main_columns : iterable of str
        Columns whose values are aggregated.
    uids : iterable of str
        Columns used as group keys.
    aggregations : iterable of str
        Aggregation names understood by pandas groupby ``agg``.
    train_df, test_df : DataFrame
        Frames that receive the new columns.
    fillna : bool
        When true, NaN in the new columns is replaced with -1.
    usena : bool
        When true, values equal to -1 in ``main_column`` are treated as NaN
        before aggregating (only in the temporary combined frame).

    Prints each new column name as progress output and returns None.
    """
    for main_column in main_columns:
        for col in uids:
            # The combined view does not depend on the aggregation type, so
            # build (and optionally mask) it once per (main_column, col) pair.
            combined = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]])
            if usena:
                # mask() yields NaN where the condition holds and handles any
                # required dtype upcast itself, unlike writing NaN through .loc.
                combined[main_column] = combined[main_column].mask(combined[main_column] == -1)
            grouped = combined.groupby(col)[main_column]

            for agg_type in aggregations:
                new_col_name = main_column + '_' + col + '_' + agg_type
                mapping = grouped.agg(agg_type).to_dict()

                for frame in (train_df, test_df):
                    encoded = frame[col].map(mapping).astype('float32')
                    if fillna:
                        # Fill before assigning: a chained
                        # frame[name].fillna(..., inplace=True) does not update
                        # the frame under pandas Copy-on-Write.
                        encoded = encoded.fillna(-1)
                    frame[new_col_name] = encoded

                print("'" + new_col_name + "'", ', ', end='')
# COMBINE FEATURES | |
def encode_CB(col1, col2, df1=X_train, df2=X_test):
    """Combine two columns into one label-encoded feature on both frames.

    The values of ``col1`` and ``col2`` are cast to strings and joined with
    an underscore into a new column named ``<col1>_<col2>`` on ``df1`` and
    ``df2`` (modified in place). That column is then label-encoded with
    ``encode_LE`` so both frames share one code table.

    Prints the new column name as progress output and returns None.
    """
    nm = col1 + '_' + col2
    df1[nm] = df1[col1].astype(str) + '_' + df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str) + '_' + df2[col2].astype(str)
    # Forward the frames explicitly; relying on encode_LE's own defaults would
    # encode X_train/X_test even when the caller supplied different frames.
    encode_LE(nm, train=df1, test=df2, verbose=False)
    print(nm, ', ', end='')
# GROUP AGGREGATION NUNIQUE | |
def encode_AG2(main_columns, uids, train_df=X_train, test_df=X_test):
    """Add per-group distinct-value counts to ``train_df`` and ``test_df``.

    For every ``main_column`` in ``main_columns`` and group key ``col`` in
    ``uids``, the number of unique ``main_column`` values within each ``col``
    group is computed over train and test stacked together, then mapped back
    onto both frames as a float32 column named ``<col>_<main_column>_ct``
    (frames are modified in place).

    Prints each new column name as progress output and returns None.
    """
    for main_column in main_columns:
        for col in uids:
            pair = [col, main_column]
            stacked = pd.concat([train_df[pair], test_df[pair]], axis=0)
            unique_counts = stacked.groupby(col)[main_column].nunique().to_dict()
            feature = col + '_' + main_column + '_ct'
            for frame in (train_df, test_df):
                frame[feature] = frame[col].map(unique_counts).astype('float32')
            print(feature + ', ', end='')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment