Skip to content

Instantly share code, notes, and snippets.

@chyikwei
Last active August 29, 2015 14:03
Show Gist options
  • Save chyikwei/2301f43a07fd60501f82 to your computer and use it in GitHub Desktop.
Save chyikwei/2301f43a07fd60501f82 to your computer and use it in GitHub Desktop.
kdd script
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import cross_validation
# models
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import normalize
# ---------------------------------------------------------------------
# File locations used by the script (paths are relative to the working
# directory the script is launched from).
# ---------------------------------------------------------------------
# Inputs read with pandas.read_csv:
outcomes_file_name = 'raw_data/outcomes.csv'
sample_submission_file_name = 'raw_data/sampleSubmission.csv'
# Disabled alternative for the category features:
#   'project_category_features_binary_na_filled.csv'
# NOTE: the variable name is misspelled ("categpry") but is kept because
# other code in this file refers to it by this exact name.
categpry_file_name = 'project_category_features_binary.csv'
main_file_name = 'Allfeatures.csv'
# Output written at the end of the run:
submit_file_name = 'predictions_0715.csv'
# ---------------------------------------------------------------------
# Columns deleted from the train/test frames before they are converted
# to numeric arrays.
exclude_fields = [
    'projectid',
    'teacher_acctid',
    'schoolid',
    'date_posted',
    'resource_types',
]
def _sort_by(frame, column):
    """Return *frame* sorted by *column* on both old and new pandas."""
    # DataFrame.sort was superseded by sort_values and later removed from
    # pandas; prefer the new name and fall back to the old one.
    if hasattr(frame, 'sort_values'):
        return frame.sort_values(column)
    return frame.sort(column)


def build_matrix(start_date, test_start_date='2014-01-01'):
    """Load the feature CSVs and build the train/test matrices.

    Reads the files named by the module-level ``outcomes_file_name``,
    ``categpry_file_name`` and ``main_file_name``.  Missing values in the
    main feature file are filled with 0.0, then main and category features
    are merged on ``projectid``.  Rows whose ``date_posted`` is before
    *test_start_date* are merged with the outcomes and form the training
    pool; rows on or after it form the test set.  Both sets are sorted by
    ``projectid`` and the columns listed in ``exclude_fields`` are dropped
    before conversion to arrays.

    Args:
        start_date: earliest ``date_posted`` kept for training (inclusive).
        test_start_date: first ``date_posted`` that belongs to the test
            set.  Defaults to the cut-off that was previously hard-coded.

    Returns:
        A tuple ``(train_X, train_response, test_X)``.  ``train_response``
        is 1 where ``is_exciting == 't'`` and 0 otherwise, and is
        row-aligned with ``train_X``.

    Note:
        Dates are compared with plain ``<`` / ``>=`` on the raw column
        values; this assumes ``date_posted`` holds ISO ``YYYY-MM-DD``
        strings so that the comparison is chronological -- TODO confirm.
    """
    outcomes = pd.read_csv(outcomes_file_name)
    cat_features = pd.read_csv(categpry_file_name)
    main_features = pd.read_csv(main_file_name).fillna(0.0)

    all_df = pd.merge(main_features, cat_features, on='projectid')
    is_test_row = all_df['date_posted'] >= test_start_date

    test_df = _sort_by(all_df[is_test_row], 'projectid')

    train_df = pd.merge(all_df[~is_test_row], outcomes, on='projectid')
    train_df = train_df[train_df['date_posted'] >= start_date]
    # Sort BEFORE extracting the labels: previously the labels were taken
    # in file order and the features were sorted afterwards, which left
    # train_X and train_response misaligned.
    train_df = _sort_by(train_df, 'projectid')
    train_response = (train_df['is_exciting'] == 't').astype(int).values

    # Keep only the columns the test set has (this removes the outcome
    # fields), then drop the identifier/date columns from both sets.
    train_features = train_df.loc[:, test_df.columns].drop(exclude_fields, axis=1)
    test_features = test_df.drop(exclude_fields, axis=1)

    train_X = np.array(train_features)
    test_X = np.array(test_features)
    return train_X, train_response, test_X
def main():
    """Train a classifier, report hold-out AUC and write the submission CSV.

    Steps:
      1. Build the feature matrices with ``build_matrix`` (training rows
         from 2013-07-01 onwards).
      2. Hold out 20% of the training rows, fit the model on the rest and
         print train / hold-out ROC AUC.
      3. Refit on all training rows, predict the test set and write the
         probabilities into a copy of the sample submission, saved to
         ``submit_file_name``.

    ``print`` is called with a single pre-formatted string so the output is
    the same under Python 2 and Python 3.
    """
    print("load file....")
    full_train_X, full_train_y, test_X = build_matrix(start_date='2013-07-01')

    # Split off a 20% hold-out set for a quick AUC estimate.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        full_train_X, full_train_y, test_size=0.2)

    # Pick a model and change its parameters here.  Alternatives tried:
    # lr = linear_model.LogisticRegression(class_weight={1: 1, 0: 1}, C=0.1)
    # rf = RandomForestClassifier(n_estimators=200, max_depth=8, min_samples_split=15)
    gbc = GradientBoostingClassifier(n_estimators=100, max_depth=4, min_samples_split=5)
    clf = gbc

    # Fit on the training split and report train / hold-out AUC.
    clf.fit(X_train, y_train)
    train_preds = clf.predict_proba(X_train)[:, 1]
    train_auc = metrics.roc_auc_score(y_train, train_preds)
    holdout_preds = clf.predict_proba(X_test)[:, 1]
    test_auc = metrics.roc_auc_score(y_test, holdout_preds)
    print('AUC train:%.4f, test:%.4f' % (train_auc, test_auc))

    # Refit on the full training data and predict the real test set.
    clf.fit(full_train_X, full_train_y)
    test_preds = clf.predict_proba(test_X)[:, 1]

    sample = pd.read_csv(sample_submission_file_name)
    # DataFrame.sort was superseded by sort_values and later removed from
    # pandas; use whichever this pandas version provides.
    sort_frame = getattr(sample, 'sort_values', None) or sample.sort
    # Predictions are assigned by position, so the submission rows must be
    # in the same projectid order as the test matrix.
    # NOTE(review): assumes the sample submission lists exactly the test
    # projectids -- confirm.
    sample = sort_frame('projectid')
    sample['is_exciting'] = test_preds
    sample.to_csv(submit_file_name, index=False)
    print("submission file generated: %s" % submit_file_name)
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment