[Kaggle utils] #python
"""Kaggle competition: Predicting a Biological Response.
Blending {RandomForests, ExtraTrees, GradientBoosting} + stretching to
[0,1]. The blending scheme is related to the idea Jose H. Solorzano
presented here:
http://www.kaggle.com/c/bioresponse/forums/t/1889/question-about-the-process-of-ensemble-learning/10950#post10950
'''You can try this: In one of the 5 folds, train the models, then use
the results of the models as 'variables' in logistic regression over
the validation data of that fold'''. Or at least this is the
implementation of my understanding of that idea :-)
The predictions are saved in test.csv. The code below created my best
submission to the competition:
- public score (25%): 0.43464
- private score (75%): 0.37751
- final rank on the private leaderboard: 17th over 711 teams :-)
Note: if you increase the number of estimators of the classifiers,
e.g. n_estimators=1000, you get a better score/rank on the private
test set.
Copyright 2012, Emanuele Olivetti.
BSD license, 3 clauses.
"""
from __future__ import division
import numpy as np
import load_data
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
def logloss(attempt, actual, epsilon=1.0e-15):
    """Logloss, i.e. the score of the bioresponse competition."""
    attempt = np.clip(attempt, epsilon, 1.0 - epsilon)
    return -np.mean(actual * np.log(attempt) +
                    (1.0 - actual) * np.log(1.0 - attempt))


if __name__ == '__main__':

    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 10
    verbose = True
    shuffle = False

    X, y, X_submission = load_data.load()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

    print "Creating train and test sets for blending."

    # out-of-fold predictions of each base model become the meta-features
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        # average the per-fold test-set predictions for this model
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print "Saving Results."
    tmp = np.vstack([range(1, len(y_submission) + 1), y_submission]).T
    np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',
               header='MoleculeId,PredictedProbability', comments='')
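# ---------------------------------------------------------------------------
# The blending script above imports a local `load_data` module that is not
# included in this gist. The sketch below shows one way that module could
# look, assuming Kaggle bioresponse-style train.csv / test.csv files with the
# target in an "Activity" column; the file names and column name are
# assumptions, not part of the original script.
# ---------------------------------------------------------------------------
import pandas as pd


def load():
    train = pd.read_csv('train.csv')               # assumed training file
    test = pd.read_csv('test.csv')                 # assumed test file
    y = train['Activity'].values                   # assumed target column
    X = train.drop('Activity', axis=1).values
    X_submission = test.values
    return X, y, X_submission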
from sklearn import cross_validation
from sklearn.metrics import log_loss, accuracy_score
import numpy as np
import pandas as pd
import random
import md5
import json
def blend_proba(clf, X_train, y, X_test, nfolds=5, save_preds="",
                save_test_only="", seed=300373, save_params="",
                clf_name="XX", generalizers_params=[], minimal_loss=0,
                return_score=False, minimizer="log_loss"):
    print("\nBlending with classifier:\n\t{}".format(clf))
    folds = list(cross_validation.StratifiedKFold(y, nfolds, shuffle=True, random_state=seed))
    print(X_train.shape)
    dataset_blend_train = np.zeros((X_train.shape[0], np.unique(y).shape[0]))

    # iterate through the train set: fit on the other folds, predict the held-out fold
    loss = 0
    for i, (train_index, test_index) in enumerate(folds):
        print("Train Fold {}/{}".format(i + 1, nfolds))
        fold_X_train = X_train[train_index]
        fold_y_train = y[train_index]
        fold_X_test = X_train[test_index]
        fold_y_test = y[test_index]
        clf.fit(fold_X_train, fold_y_train)

        fold_preds = clf.predict_proba(fold_X_test)
        print("Logistic loss: {}".format(log_loss(fold_y_test, fold_preds)))
        dataset_blend_train[test_index] = fold_preds
        if minimizer == "log_loss":
            loss += log_loss(fold_y_test, fold_preds)
        if minimizer == "accuracy":
            fold_preds_a = np.argmax(fold_preds, axis=1)
            loss += accuracy_score(fold_y_test, fold_preds_a)
        # fold_preds = clf.predict(fold_X_test)
        # loss += accuracy_score(fold_y_test, fold_preds)
        if minimal_loss > 0 and loss > minimal_loss and i == 0:
            return False, False
        fold_preds = np.argmax(fold_preds, axis=1)
        print("Accuracy: {}".format(accuracy_score(fold_y_test, fold_preds)))
    avg_loss = loss / float(i + 1)
    print("\nAverage:\t{}\n".format(avg_loss))

    # predict the test set (better to average over all folds, but this is quicker)
    print("Test Fold 1/1")
    clf.fit(X_train, y)
    dataset_blend_test = clf.predict_proba(X_test)

    if clf_name == "XX":
        clf_name = str(clf)[1:3]

    if len(save_preds) > 0:
        id = md5.new("{}".format(clf.get_params())).hexdigest()
        print("storing meta predictions at: {}".format(save_preds))
        np.save("{}_{}_{}_{}_train.npy".format(save_preds, clf_name, avg_loss, id), dataset_blend_train)
        np.save("{}_{}_{}_{}_test.npy".format(save_preds, clf_name, avg_loss, id), dataset_blend_test)

    if len(save_test_only) > 0:
        id = md5.new("{}".format(clf.get_params())).hexdigest()
        print("storing meta predictions at: {}".format(save_test_only))
        dataset_blend_test = clf.predict(X_test)
        np.savetxt("{}_{}_{}_{}_test.txt".format(save_test_only, clf_name, avg_loss, id), dataset_blend_test)
        d = {}
        d["stacker"] = clf.get_params()
        d["generalizers"] = generalizers_params
        with open("{}_{}_{}_{}_params.json".format(save_test_only, clf_name, avg_loss, id), 'wb') as f:
            json.dump(d, f)

    if len(save_params) > 0:
        id = md5.new("{}".format(clf.get_params())).hexdigest()
        d = {}
        d["name"] = clf_name
        d["params"] = {k: (v.get_params() if "\n" in str(v) or "<" in str(v) else v)
                       for k, v in clf.get_params().items()}
        d["generalizers"] = generalizers_params
        with open("{}_{}_{}_{}_params.json".format(save_params, clf_name, avg_loss, id), 'wb') as f:
            json.dump(d, f)

    if np.unique(y).shape[0] == 2:  # binary classification: only return the positive-class proba
        if return_score:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1], avg_loss
        else:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1]
    else:
        if return_score:
            return dataset_blend_train, dataset_blend_test, avg_loss
        else:
            return dataset_blend_train, dataset_blend_test
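# ---------------------------------------------------------------------------
# Minimal usage sketch for blend_proba above, on synthetic data; the data,
# classifier and clf_name are illustrative assumptions, and the same
# Python 2 / old scikit-learn environment as the function is assumed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X_all, y_all = make_classification(n_samples=200, n_features=20, random_state=0)
    X_tr, y_tr, X_te = X_all[:150], y_all[:150], X_all[150:]

    rf = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=0)
    # train_meta holds out-of-fold probabilities for the train set, test_meta
    # the probabilities of the model refit on all training data; both can be
    # used as features for a second-level (stacker) model.
    train_meta, test_meta = blend_proba(rf, X_tr, y_tr, X_te, nfolds=5, clf_name="RF")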
import pandas as pd
import sys
first_file = sys.argv[1]
second_file = sys.argv[2]
def corr(first_file, second_file):
    first_df = pd.read_csv(first_file, index_col=0)
    second_df = pd.read_csv(second_file, index_col=0)
    # assuming first column is `prediction_id` and second column is `prediction`
    prediction = first_df.columns[0]
    # correlation
    print(first_df[prediction])
    print("Finding correlation between: {} and {}".format(first_file, second_file))
    print("Column to be measured: {}".format(prediction))
    print("Pearson's correlation score: {}".format(first_df[prediction].corr(second_df[prediction], method='pearson')))
    print("Kendall's correlation score: {}".format(first_df[prediction].corr(second_df[prediction], method='kendall')))
    print("Spearman's correlation score: {}".format(first_df[prediction].corr(second_df[prediction], method='spearman')))
corr(first_file, second_file)
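# Example invocation for the correlation check above (file names are
# illustrative), assuming two Kaggle submission files whose first column is an
# id and whose second column is the prediction:
#
#   python correlations.py submission_rf.csv submission_gbm.csv
#
# Two strong but weakly correlated submissions are usually better candidates
# for blending than two highly correlated ones.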
from collections import defaultdict
from glob import glob
import sys
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
# Arithmetic-mean ensembling: average the prediction column of all matched
# submission files, row by row.
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    if method == "average":
        scores = defaultdict(float)
    with open(loc_outfile, "w") as outfile:
        for i, glob_file in enumerate(glob(glob_files)):
            print("parsing: {}".format(glob_file))
            # sort glob_file by first column, ignoring the first line
            lines = open(glob_file).readlines()
            lines = [lines[0]] + sorted(lines[1:])
            for e, line in enumerate(lines):
                if i == 0 and e == 0:
                    outfile.write(line)
                if e > 0:
                    row = line.strip().split(",")
                    scores[(e, row[0])] += float(row[1])
        for j, k in sorted(scores):
            outfile.write("%s,%f\n" % (k, scores[(j, k)] / (i + 1)))
        print("wrote to {}".format(loc_outfile))
kaggle_bag(glob_files, loc_outfile)
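# Example invocation for the averaging script above (file names illustrative);
# quote the glob pattern so the shell does not expand it before Python sees it:
#
#   python kaggle_avg.py "./submissions/method_*.csv" blend_mean.csv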
from __future__ import division
from collections import defaultdict
from glob import glob
import sys
import math
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
# Geometric-mean ensembling: multiply the predictions from all matched files
# and take the n-th root, row by row.
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    if method == "average":
        scores = defaultdict(float)
    with open(loc_outfile, "w") as outfile:
        for i, glob_file in enumerate(glob(glob_files)):
            print("parsing: {}".format(glob_file))
            # sort glob_file by first column, ignoring the first line
            lines = open(glob_file).readlines()
            lines = [lines[0]] + sorted(lines[1:])
            for e, line in enumerate(lines):
                if i == 0 and e == 0:
                    outfile.write(line)
                if e > 0:
                    row = line.strip().split(",")
                    if scores[(e, row[0])] == 0:
                        scores[(e, row[0])] = 1
                    scores[(e, row[0])] *= float(row[1])
        for j, k in sorted(scores):
            outfile.write("%s,%f\n" % (k, math.pow(scores[(j, k)], 1 / (i + 1))))
        print("wrote to {}".format(loc_outfile))
kaggle_bag(glob_files, loc_outfile)
from __future__ import division
from collections import defaultdict
from glob import glob
import sys
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
# Rank-average ensembling: replace each file's predictions by their ranks,
# average the ranks per id, then rescale the final ranking to [0, 1].
def kaggle_bag(glob_files, loc_outfile):
    with open(loc_outfile, "w") as outfile:
        all_ranks = defaultdict(list)
        for i, glob_file in enumerate(glob(glob_files)):
            file_ranks = []
            print("parsing: {}".format(glob_file))
            # sort glob_file by first column, ignoring the first line
            lines = open(glob_file).readlines()
            lines = [lines[0]] + sorted(lines[1:])
            for e, line in enumerate(lines):
                if e == 0 and i == 0:
                    outfile.write(line)
                elif e > 0:
                    r = line.strip().split(",")
                    file_ranks.append((float(r[1]), e, r[0]))
            for rank, item in enumerate(sorted(file_ranks)):
                all_ranks[(item[1], item[2])].append(rank)
        average_ranks = []
        for k in sorted(all_ranks):
            average_ranks.append((sum(all_ranks[k]) / len(all_ranks[k]), k))
        ranked_ranks = []
        for rank, k in enumerate(sorted(average_ranks)):
            ranked_ranks.append((k[1][0], k[1][1], rank / (len(average_ranks) - 1)))
        for k in sorted(ranked_ranks):
            outfile.write("%s,%s\n" % (k[1], k[2]))
        print("wrote to {}".format(loc_outfile))
kaggle_bag(glob_files, loc_outfile)
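# Worked example for the rank averaging above, assuming three ids and two
# submission files: an id ranked 0 in one file and 2 in the other gets an
# average rank of 1.0; the average ranks are then ranked again and scaled by
# rank/(n-1), so the written predictions are evenly spaced in [0, 1]
# (0, 0.5, 1.0 for three ids). Only the ordering survives, which makes this
# suited to rank-based metrics such as AUC rather than to log loss.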
from collections import defaultdict, Counter
from glob import glob
import sys
import re
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
weights_strategy = "uniform"
if len(sys.argv) == 4:
    weights_strategy = sys.argv[3]


# Majority-vote ensembling: take the most common prediction per id across all
# matched files; with weights="weighted", a file whose name matches _W<digits>_
# has its vote counted <digits> times.
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    pattern = re.compile(r"(.)*_[w|W](\d*)_[.]*")
    if method == "average":
        scores = defaultdict(list)
    with open(loc_outfile, "w") as outfile:
        # weight_list may be useful with a different method
        weight_list = [1] * len(glob(glob_files))
        for i, glob_file in enumerate(glob(glob_files)):
            print("parsing: {}".format(glob_file))
            if weights == "weighted":
                weight = pattern.match(glob_file)
                if weight and weight.group(2):
                    print("Using weight: {}".format(weight.group(2)))
                    weight_list[i] = weight_list[i] * int(weight.group(2))
                else:
                    print("Using weight: 1")
            # sort glob_file by first column, ignoring the first line
            lines = open(glob_file).readlines()
            lines = [lines[0]] + sorted(lines[1:])
            for e, line in enumerate(lines):
                if i == 0 and e == 0:
                    outfile.write(line)
                if e > 0:
                    row = line.strip().split(",")
                    for l in range(1, weight_list[i] + 1):
                        scores[(e, row[0])].append(row[1])
        for j, k in sorted(scores):
            outfile.write("%s,%s\n" % (k, Counter(scores[(j, k)]).most_common(1)[0][0]))
        print("wrote to {}".format(loc_outfile))
kaggle_bag(glob_files, loc_outfile, weights=weights_strategy)
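# Example invocation for the voting script above (file names illustrative);
# the weighted mode reads an integer weight from the file name via the
# _[wW]<digits>_ pattern:
#
#   python kaggle_vote.py "./submissions/method_*.csv" blend_vote.csv weighted
#
# A file named method_W3_xgb.csv would then be counted three times in the
# majority vote, while files without a _W<digits>_ tag keep weight 1.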