#!/usr/bin/env python
import argparse
import time

import numpy as np
import pandas as pd
import scipy.sparse as sparse
from sklearn import cross_validation, metrics
from sklearn.datasets import dump_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from const import SEC_PER_MIN
from logger import log
from utility import normalize_numerical_feature2
CONST_COL = ['f33', 'f34', 'f35', 'f37', 'f38', 'f700', 'f701', 'f702',
             'f736', 'f764']


def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file):
    log.info('loading training and test data files.')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    # set the target variable, loss
    trn['loss'] = np.log1p(trn['loss'])
    tst['loss'] = 0

    trn['time'] = trn.index
    tst['time'] = len(trn)

    # combine train and test data
    n_trn = len(trn)
    df = pd.concat([trn, tst], ignore_index=True)

    log.info('dropping constant columns')
    df = df.drop(CONST_COL, axis=1)

    log.info('filling missing values with median values.')
    df = df.fillna(df.median())

    # 2 golden features out of 4
    df['f274'] = df['f528'] - df['f274']
    df['f527'] = df['f528'] - df['f527']

    # rearrange variables to be the target variable followed by numeric
    # features, binary features, and categorical features
    columns = [x for x in df.columns]
    df = df[['loss', 'time'] + [columns[1]] + [columns[3]] + columns[7:-5] +
            columns[757:759]]

    # transform numerical variables into normal distributions
    for col in df.columns[1:-2]:
        log.info('transform feature {} into normal distribution'.format(col))
        df[col] = normalize_numerical_feature2(df[col], n_trn)

    # extract the target variable; the feature matrix is converted into a
    # sparse matrix after feature selection below.
    y = np.array(df['loss'])

    # run forward feature selection
    selected_features = ['f271', 'f274', 'f527', 'f528']
    features_to_test = [x for x in df.columns[1:]
                        if x not in selected_features]

    # create index for 5-fold cross validation on the last n_sub training rows
    n_fold = 5
    n_sub = 20000
    cv = cross_validation.KFold(n_sub, n_folds=n_fold, indices=True,
                                shuffle=True, random_state=1)

    # binary target for feature selection: 1 if there was any loss, else 0
    yb = y[:n_trn].copy()
    yb[yb > 0] = 1
    yb = yb[-n_sub:]

    auc_cv_old = .5
    is_improving = True
    while is_improving:
        auc_cvs = []
        for feature in features_to_test:
            log.info('{}'.format(selected_features + [feature]))
            X = np.array(df[selected_features + [feature]])[:n_trn]
            X = X[-n_sub:]

            auc_cv = 0.
            for i, (i_trn, i_val) in enumerate(cv, start=1):
                clf = LogisticRegression(C=1, class_weight='auto',
                                         random_state=2014)
                clf.fit(X[i_trn], yb[i_trn])
                yhat = clf.predict_proba(X[i_val])[:, 1]
                auc = metrics.roc_auc_score(yb[i_val], yhat)
                auc_cv += auc / n_fold

            log.info('AUC CV: {}'.format(auc_cv))
            auc_cvs.append(auc_cv)

        auc_cv_new = max(auc_cvs)
        if auc_cv_new > auc_cv_old:
            auc_cv_old = auc_cv_new
            feature = features_to_test.pop(auc_cvs.index(auc_cv_new))
            selected_features.append(feature)
            log.info('selected features: {}'.format(selected_features))
            with open('selected_features.txt', 'w') as f:
                f.write('{}\n'.format('\n'.join(selected_features)))
        else:
            is_improving = False

    log.info('final selected features: {}'.format(selected_features))
    log.info('saving selected feature names as a file')
    with open('selected_features.txt', 'w') as f:
        f.write('{}\n'.format('\n'.join(selected_features)))

    X = sparse.csr_matrix(df[selected_features])
    log.debug('feature matrix: {}x{}'.format(X.shape[0], X.shape[1]))

    log.info('saving features for training data')
    dump_svmlight_file(X[:n_trn], y[:n_trn], train_feature_file,
                       zero_based=False)

    log.info('saving features for test data')
    dump_svmlight_file(X[n_trn:], y[n_trn:], test_feature_file,
                       zero_based=False)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', required=True, dest='train')
    parser.add_argument('--test-file', required=True, dest='test')
    parser.add_argument('--train-feature-file', required=True,
                        dest='train_feature')
    parser.add_argument('--test-feature-file', required=True,
                        dest='test_feature')
    args = parser.parse_args()

    start = time.time()
    generate_feature(train_file=args.train,
                     test_file=args.test,
                     train_feature_file=args.train_feature,
                     test_feature_file=args.test_feature)
    log.info('finished ({:.2f} min elapsed).'.format((time.time() - start) /
                                                     SEC_PER_MIN))
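
The script is run from the command line with the four required arguments defined above, and it imports three small local modules (const, logger, utility) that are not part of this gist. The sketch below shows a possible invocation and minimal stand-ins for those modules; the script and data file names are hypothetical, and the body of normalize_numerical_feature2 is only an assumption inferred from how it is used here (a rank-based mapping of each feature to a normal distribution), so the actual implementations may differ.

# example invocation (script and file names are hypothetical):
#   python generate_feature.py --train-file train.csv --test-file test.csv \
#       --train-feature-file train.sps --test-feature-file test.sps

# const.py (assumed): seconds per minute, used for the elapsed-time log line
SEC_PER_MIN = 60

# logger.py (assumed): a module-level logger named `log`
import logging
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    level=logging.DEBUG)
log = logging.getLogger(__name__)

# utility.py (assumed): rank-based inverse-normal (Gauss rank) transform;
# the real helper may instead use n_trn to fit statistics on the first
# n_trn (training) rows only.
from scipy import stats

def normalize_numerical_feature2(feature, n_trn):
    ranks = stats.rankdata(feature)            # ranks 1..N over all rows
    quantiles = (ranks - 0.5) / len(feature)   # map ranks into (0, 1)
    return stats.norm.ppf(quantiles)           # inverse standard normal CDF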