Last active
August 29, 2015 13:57
-
-
Save jeongyoonlee/9574040 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import argparse | |
| import numpy as np | |
| import pandas as pd | |
| import scipy.sparse as sparse | |
| from sklearn import cross_validation, metrics | |
| from sklearn.datasets import dump_svmlight_file | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.preprocessing import StandardScaler | |
| import time | |
| from const import SEC_PER_MIN | |
| from logger import log | |
| from utility import normalize_numerical_feature2 | |
# Feature columns dropped as uninformative in generate_feature().
# NOTE(review): presumably these were found to be constant across the
# combined train/test data in the original analysis -- verify against
# the data if the input files change.
CONST_COL = ['f33', 'f34', 'f35', 'f37', 'f38', 'f700', 'f701', 'f702',
             'f736', 'f764']
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file):
    """Build features for the loan-default data and save them as svmlight files.

    Reads the train/test CSVs, log-transforms the target, drops constant
    columns, imputes medians, derives two difference features, normalizes
    numeric columns, runs greedy forward feature selection with a
    LogisticRegression/AUC criterion on a subsample, and dumps the selected
    feature matrix for train and test.

    Side effects: writes `selected_features.txt` in the working directory
    (overwritten on each improvement and once more at the end) and the two
    svmlight output files.

    NOTE(review): written against a pre-0.18 scikit-learn API --
    `sklearn.cross_validation`, `KFold(..., indices=True)` and
    `class_weight='auto'` were all removed in later releases; this code will
    not run on modern scikit-learn without porting.
    """
    log.info('loading training and test data files.')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    # set the target variable, loss
    # log1p compresses the heavy-tailed loss; test target is a placeholder 0
    trn['loss'] = np.log1p(trn['loss'])
    tst['loss'] = 0
    # 'time' encodes row order: train rows get their index, every test row
    # gets len(trn) -- presumably a proxy for temporal ordering; TODO confirm
    trn['time'] = trn.index
    tst['time'] = len(trn)

    # combine train and test data so transformations are applied consistently;
    # n_trn marks where train rows end inside the combined frame
    n_trn = len(trn)
    df = pd.concat([trn, tst], ignore_index=True)

    log.info('dropping constant columns')
    df = df.drop(CONST_COL, 1)

    log.info('filling missing values with median values.')
    df = df.fillna(df.median())

    # 2 golden features out of 4
    # differences against f528 -- well-known "golden features" for this
    # competition; note f274/f527 are overwritten in place
    df['f274'] = df['f528'] - df['f274']
    df['f527'] = df['f528'] - df['f527']

    # rearrange variables to be the target variable followed by numeric
    # features, binary features, and categorical features
    # NOTE(review): positional slices (columns[1], columns[3], 7:-5, 757:759)
    # are hard-wired to this data set's column layout after the drop above --
    # they silently select wrong columns if the input schema changes.
    columns = [x for x in df.columns]
    df = df[['loss', 'time'] + [columns[1]] + [columns[3]] + columns[7:-5] +
            columns[757:759]]

    # transform numerical variables into normal distributions
    # iterates df.columns[1:-2]: skips 'loss', includes 'time', and leaves the
    # last two (categorical) columns untransformed
    for col in df.columns[1:-2]:
        log.info('transform feature {} into normal distribution'.format(col))
        # normalization is fit on the first n_trn (training) rows only
        df[col] = normalize_numerical_feature2(df[col], n_trn)

    # convert the feature data frame into sparse matrix except categorical
    # variables.
    y = np.array(df['loss'])

    # run forward feature selection
    # seed the selection with the known golden features
    selected_features = ['f271', 'f274', 'f527', 'f528']
    features_to_test = [x for x in df.columns[1:]
                        if x not in selected_features]

    # create index for 5-fold cross validation on the last n_sub training rows
    n_fold = 5
    n_sub = 20000
    # old-API KFold object: re-iterable, so reusing `cv` across candidate
    # features below yields the same folds each time
    cv = cross_validation.KFold(n_sub, n_folds=n_fold, indices=True,
                                shuffle=True, random_state=1)

    # binarize the target (default vs no default) for AUC-based selection;
    # evaluate on the most recent n_sub training rows only
    yb = y[:n_trn].copy()
    yb[yb > 0] = 1
    yb = yb[-n_sub:]

    # greedy loop: each pass scores every remaining candidate feature added to
    # the current selection; keep the best one while CV AUC keeps improving
    auc_cv_old = .5
    is_improving = True
    while is_improving:
        auc_cvs = []
        for feature in features_to_test:
            log.info('{}'.format(selected_features + [feature]))
            X = np.array(df[selected_features + [feature]])[:n_trn]
            X = X[-n_sub:]
            auc_cv = 0.
            for i, (i_trn, i_val) in enumerate(cv, start=1):
                clf = LogisticRegression(C=1, class_weight='auto',
                                         random_state=2014)
                clf.fit(X[i_trn], yb[i_trn])
                yhat = clf.predict_proba(X[i_val])[:, 1]
                auc = metrics.roc_auc_score(yb[i_val], yhat)
                # running mean of fold AUCs
                auc_cv += auc / n_fold
            log.info('AUC CV: {}'.format(auc_cv))
            auc_cvs.append(auc_cv)
        auc_cv_new = max(auc_cvs)
        if auc_cv_new > auc_cv_old:
            auc_cv_old = auc_cv_new
            feature = features_to_test.pop(auc_cvs.index(auc_cv_new))
            selected_features.append(feature)
            log.info('selected features: {}'.format(selected_features))
            # checkpoint the selection so progress survives an interrupted run
            with open('selected_features.txt', 'w') as f:
                f.write('{}\n'.format('\n'.join(selected_features)))
        else:
            is_improving = False

    log.info('final selected features: {}'.format(selected_features))
    log.info('saving selected feature names as a file')
    with open('selected_features.txt', 'w') as f:
        f.write('{}\n'.format('\n'.join(selected_features)))

    # materialize the final feature matrix and split back into train/test
    X = sparse.csr_matrix(df[selected_features])
    log.debug('feature matrix: {}x{}'.format(X.shape[0], X.shape[1]))

    log.info('saving features for training data')
    dump_svmlight_file(X[:n_trn], y[:n_trn], train_feature_file,
                       zero_based=False)
    log.info("saving features for test data")
    dump_svmlight_file(X[n_trn:], y[n_trn:], test_feature_file,
                       zero_based=False)
if __name__ == "__main__":
    # Command-line interface: paths for the two input CSVs and the two
    # svmlight feature files to produce.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', required=True, dest='train')
    parser.add_argument('--test-file', required=True, dest='test')
    parser.add_argument('--train-feature-file', required=True,
                        dest='train_feature')
    parser.add_argument('--test-feature-file', required=True,
                        dest='test_feature')
    args = parser.parse_args()

    start = time.time()
    generate_feature(train_file=args.train,
                     test_file=args.test,
                     train_feature_file=args.train_feature,
                     test_feature_file=args.test_feature)
    # fixed typo in the log message: 'elasped' -> 'elapsed'
    log.info('finished ({:.2f} min elapsed).'.format((time.time() - start) /
                                                     SEC_PER_MIN))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment