Last active
August 16, 2018 08:20
-
-
Save nlpjoe/38957f59cd5e6c6f8de7040a3bb9c486 to your computer and use it in GitHub Desktop.
[Kaggle utils] #python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Kaggle competition: Predicting a Biological Response. | |
| Blending {RandomForests, ExtraTrees, GradientBoosting} + stretching to | |
| [0,1]. The blending scheme is related to the idea Jose H. Solorzano | |
| presented here: | |
| http://www.kaggle.com/c/bioresponse/forums/t/1889/question-about-the-process-of-ensemble-learning/10950#post10950 | |
| '''You can try this: In one of the 5 folds, train the models, then use | |
| the results of the models as 'variables' in logistic regression over | |
| the validation data of that fold'''. Or at least this is the | |
| implementation of my understanding of that idea :-) | |
| The predictions are saved in submission.csv. The code below created my best | |
| submission to the competition: | |
| - public score (25%): 0.43464 | |
| - private score (75%): 0.37751 | |
| - final rank on the private leaderboard: 17th over 711 teams :-) | |
| Note: if you increase the number of estimators of the classifiers, | |
| e.g. n_estimators=1000, you get a better score/rank on the private | |
| test set. | |
| Copyright 2012, Emanuele Olivetti. | |
| BSD license, 3 clauses. | |
| """ | |
| from __future__ import division | |
| import numpy as np | |
| import load_data | |
| from sklearn.cross_validation import StratifiedKFold | |
| from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier | |
| from sklearn.ensemble import GradientBoostingClassifier | |
| from sklearn.linear_model import LogisticRegression | |
def logloss(attempt, actual, epsilon=1.0e-15):
    """Binomial log-loss, the scoring metric of the bioresponse competition.

    Probabilities are clipped to [epsilon, 1 - epsilon] so that log(0)
    can never occur for over-confident predictions.
    """
    clipped = np.clip(attempt, epsilon, 1.0 - epsilon)
    positive_term = actual * np.log(clipped)
    negative_term = (1.0 - actual) * np.log(1.0 - clipped)
    return -np.mean(positive_term + negative_term)
if __name__ == '__main__':
    # Fixed seed so the optional shuffle below is reproducible.
    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 10
    verbose = True
    shuffle = False

    # load_data is a project-local module -- presumably returns train
    # features, train labels and the unlabeled submission set; confirm
    # against its implementation.
    X, y, X_submission = load_data.load()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    # Old (pre-0.18) scikit-learn API: StratifiedKFold(labels, n_folds).
    skf = list(StratifiedKFold(y, n_folds))

    # Level-0 generalizers; their out-of-fold predictions become the
    # meta-features for the level-1 logistic regression further down.
    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

    print "Creating train and test sets for blending."

    # dataset_blend_train[n, j]: out-of-fold P(class=1) of clf j for train row n.
    # dataset_blend_test[m, j]: fold-averaged P(class=1) of clf j for test row m.
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print j, clf
        # One column of test-set predictions per fold; averaged below.
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            # Positive-class probability on the held-out fold is the
            # meta-feature for exactly those rows (no leakage).
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        # Collapse the per-fold test predictions into a single column.
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print "Blending."
    # Level-1 stacker: logistic regression over the blended meta-features.
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    # Min-max stretch; monotone, so ranking (and thus AUC) is unchanged.
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print "Saving Results."
    # Column 1: 1-based molecule id, column 2: predicted probability.
    tmp = np.vstack([range(1, len(y_submission)+1), y_submission]).T
    np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',
               header='MoleculeId,PredictedProbability', comments='')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn import cross_validation | |
| from sklearn.metrics import log_loss, accuracy_score | |
| import numpy as np | |
| import pandas as pd | |
| import random | |
| import md5 | |
| import json | |
def blend_proba(clf, X_train, y, X_test, nfolds=5, save_preds="",
                save_test_only="", seed=300373, save_params="",
                clf_name="XX", generalizers_params=None, minimal_loss=0,
                return_score=False, minimizer="log_loss"):
    """Out-of-fold stacking ("blending") helper.

    Fits `clf` on `nfolds` stratified folds of (X_train, y), collecting the
    out-of-fold class probabilities as meta-features, then refits on the full
    training set to produce test-set meta-features.

    Parameters
    ----------
    clf : estimator with fit / predict_proba / get_params
    X_train, y, X_test : arrays; X_train rows are indexed by the CV folds
    nfolds : number of stratified folds
    save_preds : path prefix; if non-empty, save train/test meta-features as .npy
    save_test_only : path prefix; if non-empty, save hard test predictions + params
    seed : random_state for the fold split
    save_params : path prefix; if non-empty, dump estimator params as JSON
    clf_name : short tag used in output filenames ("XX" -> derived from repr)
    generalizers_params : optional list recorded in the JSON dumps
    minimal_loss : if > 0, abort after fold 1 when its loss already exceeds it
    return_score : also return the average fold loss
    minimizer : "log_loss" or "accuracy" -- which fold metric to accumulate

    Returns
    -------
    (train_meta, test_meta[, avg_loss]) -- for binary problems only the
    positive-class column is returned; (False, False) on early abort.
    """
    import hashlib  # local import: replaces the Python-2-only `md5` module

    if generalizers_params is None:  # avoid a shared mutable default argument
        generalizers_params = []
    print("\nBlending with classifier:\n\t{}".format(clf))
    folds = list(cross_validation.StratifiedKFold(y, nfolds, shuffle=True, random_state=seed))
    print(X_train.shape)
    # One column per class: out-of-fold probabilities for every train row.
    dataset_blend_train = np.zeros((X_train.shape[0], np.unique(y).shape[0]))
    # Iterate through the train set, training on each fold and predicting its
    # held-out part.
    loss = 0
    for i, (train_index, test_index) in enumerate(folds):
        print("Train Fold {}/{}".format(i + 1, nfolds))  # fixed stray '}' typo
        fold_X_train = X_train[train_index]
        fold_y_train = y[train_index]
        fold_X_test = X_train[test_index]
        fold_y_test = y[test_index]
        clf.fit(fold_X_train, fold_y_train)
        fold_preds = clf.predict_proba(fold_X_test)
        print("Logistic loss: {}".format(log_loss(fold_y_test, fold_preds)))
        dataset_blend_train[test_index] = fold_preds
        if minimizer == "log_loss":
            loss += log_loss(fold_y_test, fold_preds)
        if minimizer == "accuracy":
            fold_preds_a = np.argmax(fold_preds, axis=1)
            loss += accuracy_score(fold_y_test, fold_preds_a)
        # Early abort: first fold already worse than the requested threshold.
        if minimal_loss > 0 and loss > minimal_loss and i == 0:
            return False, False
        fold_preds = np.argmax(fold_preds, axis=1)
        print("Accuracy: {}".format(accuracy_score(fold_y_test, fold_preds)))
    avg_loss = loss / float(i + 1)
    print("\nAverage:\t{}\n".format(avg_loss))

    # Predict the test set (averaging over per-fold models would be better,
    # but a single refit on all training data is quicker).
    print("Test Fold 1/1")
    clf.fit(X_train, y)
    dataset_blend_test = clf.predict_proba(X_test)

    if clf_name == "XX":
        clf_name = str(clf)[1:3]
    # Stable fingerprint of the estimator configuration, used to tag artifacts.
    # (renamed from `id`, which shadowed the builtin)
    params_id = hashlib.md5(str(clf.get_params()).encode("utf-8")).hexdigest()
    if len(save_preds) > 0:
        print("storing meta predictions at: {}".format(save_preds))
        np.save("{}_{}_{}_{}_train.npy".format(save_preds, clf_name, avg_loss, params_id),
                dataset_blend_train)
        np.save("{}_{}_{}_{}_test.npy".format(save_preds, clf_name, avg_loss, params_id),
                dataset_blend_test)
    if len(save_test_only) > 0:
        print("storing meta predictions at: {}".format(save_test_only))
        # Hard class labels only (no probabilities) for the test set.
        dataset_blend_test = clf.predict(X_test)
        np.savetxt("{}_{}_{}_{}_test.txt".format(save_test_only, clf_name, avg_loss, params_id),
                   dataset_blend_test)
        d = {}
        d["stacker"] = clf.get_params()
        d["generalizers"] = generalizers_params
        # Text mode: json.dump writes str, not bytes.
        with open("{}_{}_{}_{}_params.json".format(save_test_only, clf_name, avg_loss, params_id), "w") as f:
            json.dump(d, f)
    if len(save_params) > 0:
        d = {}
        d["name"] = clf_name
        # Nested estimators (reprs containing '\n' or '<') are flattened to
        # their own params so the result stays JSON-serializable.
        d["params"] = {k: (v.get_params() if "\n" in str(v) or "<" in str(v) else v)
                       for k, v in clf.get_params().items()}
        d["generalizers"] = generalizers_params
        with open("{}_{}_{}_{}_params.json".format(save_params, clf_name, avg_loss, params_id), "w") as f:
            json.dump(d, f)
    if np.unique(y).shape[0] == 2:  # when binary classification only return positive class proba
        if return_score:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1], avg_loss
        else:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1]
    else:
        if return_score:
            return dataset_blend_train, dataset_blend_test, avg_loss
        else:
            return dataset_blend_train, dataset_blend_test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import sys | |
# CLI arguments: paths of the two submission CSV files to compare.
first_file = sys.argv[1]
second_file = sys.argv[2]
def corr(first_file, second_file):
    """Print Pearson, Kendall and Spearman correlations between the
    prediction columns of two Kaggle submission CSV files."""
    df_a = pd.read_csv(first_file, index_col=0)
    df_b = pd.read_csv(second_file, index_col=0)
    # assuming first column is `prediction_id` and second column is `prediction`
    prediction = df_a.columns[0]
    # correlation
    print(df_a[prediction])
    print(f"Finding correlation between: {first_file} and {second_file}")
    print(f"Column to be measured: {prediction}")
    print(f"Pearson's correlation score: {df_a[prediction].corr(df_b[prediction], method='pearson')}")
    print(f"Kendall's correlation score: {df_a[prediction].corr(df_b[prediction], method='kendall')}")
    print(f"Spearman's correlation score: {df_a[prediction].corr(df_b[prediction], method='spearman')}")
| corr(first_file, second_file) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import defaultdict | |
| from glob import glob | |
| import sys | |
# CLI arguments: a (quoted) glob of submission files, and the output path.
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    """Ensemble Kaggle submission files by arithmetic mean.

    Every file matched by `glob_files` must be a CSV with a header line and
    rows of `id,prediction`. The per-id mean of all files is written to
    `loc_outfile` with the header copied from the first file.

    `method` and `weights` are accepted for interface symmetry with the
    sibling scripts; only method == "average" does anything here.
    """
    if method == "average":
        scores = defaultdict(float)
        with open(loc_outfile, "w") as outfile:
            for i, glob_file in enumerate(glob(glob_files)):
                print("parsing: {}".format(glob_file))
                # sort glob_file by first column, ignoring the first line
                # (fix: close the input handle instead of leaking it)
                with open(glob_file) as infile:
                    lines = infile.readlines()
                lines = [lines[0]] + sorted(lines[1:])
                for e, line in enumerate(lines):
                    if i == 0 and e == 0:
                        outfile.write(line)  # copy the header exactly once
                    if e > 0:
                        row = line.strip().split(",")
                        # key on (row position, id) to tolerate duplicate ids
                        scores[(e, row[0])] += float(row[1])
            # i+1 == number of files parsed
            for j, k in sorted(scores):
                outfile.write("%s,%f\n" % (k, scores[(j, k)] / (i + 1)))
    print("wrote to {}".format(loc_outfile))
| kaggle_bag(glob_files, loc_outfile) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import division | |
| from collections import defaultdict | |
| from glob import glob | |
| import sys | |
| import math | |
# CLI arguments: a (quoted) glob of submission files, and the output path.
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    """Ensemble Kaggle submission files by geometric mean.

    Every file matched by `glob_files` must be a CSV with a header line and
    rows of `id,prediction`. For each id the product of all predictions is
    accumulated and the (i+1)-th root written to `loc_outfile`.

    `method` and `weights` are accepted for interface symmetry with the
    sibling scripts; only method == "average" does anything here.
    """
    if method == "average":
        scores = defaultdict(float)
        with open(loc_outfile, "w") as outfile:
            for i, glob_file in enumerate(glob(glob_files)):
                print("parsing: {}".format(glob_file))
                # sort glob_file by first column, ignoring the first line
                # (fix: close the input handle instead of leaking it)
                with open(glob_file) as infile:
                    lines = infile.readlines()
                lines = [lines[0]] + sorted(lines[1:])
                for e, line in enumerate(lines):
                    if i == 0 and e == 0:
                        outfile.write(line)  # copy the header exactly once
                    if e > 0:
                        row = line.strip().split(",")
                        # defaultdict yields 0.0 for a new key; switch to the
                        # multiplicative identity before accumulating products
                        if scores[(e, row[0])] == 0:
                            scores[(e, row[0])] = 1
                        scores[(e, row[0])] *= float(row[1])
            # i+1 == number of files parsed; take the (i+1)-th root
            for j, k in sorted(scores):
                outfile.write("%s,%f\n" % (k, math.pow(scores[(j, k)], 1 / (i + 1))))
    print("wrote to {}".format(loc_outfile))
| kaggle_bag(glob_files, loc_outfile) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import division | |
| from collections import defaultdict | |
| from glob import glob | |
| import sys | |
# CLI arguments: a (quoted) glob of submission files, and the output path.
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
def kaggle_bag(glob_files, loc_outfile):
    """Ensemble Kaggle submission files by normalized rank averaging.

    Each matched CSV (header + `id,prediction` rows) is converted to ranks;
    the per-id ranks are averaged across files, re-ranked, and normalized to
    [0, 1] before being written to `loc_outfile`.
    """
    with open(loc_outfile, "w") as outfile:
        all_ranks = defaultdict(list)
        for i, glob_file in enumerate(glob(glob_files)):
            file_ranks = []
            print("parsing: {}".format(glob_file))
            # sort glob_file by first column, ignoring the first line
            # (fix: close the input handle instead of leaking it)
            with open(glob_file) as infile:
                lines = infile.readlines()
            lines = [lines[0]] + sorted(lines[1:])
            for e, line in enumerate(lines):
                if e == 0 and i == 0:
                    outfile.write(line)  # copy the header exactly once
                elif e > 0:
                    r = line.strip().split(",")
                    # (prediction, row position, id): sorting ranks by value
                    file_ranks.append((float(r[1]), e, r[0]))
            # per-file rank of every id, collected across all files
            for rank, item in enumerate(sorted(file_ranks)):
                all_ranks[(item[1], item[2])].append(rank)
        average_ranks = []
        for k in sorted(all_ranks):
            average_ranks.append((sum(all_ranks[k]) / len(all_ranks[k]), k))
        ranked_ranks = []
        # fix: with a single prediction row the original divided by zero
        denom = max(len(average_ranks) - 1, 1)
        for rank, k in enumerate(sorted(average_ranks)):
            # normalize the final rank into [0, 1]
            ranked_ranks.append((k[1][0], k[1][1], rank / denom))
        for k in sorted(ranked_ranks):
            outfile.write("%s,%s\n" % (k[1], k[2]))
    print("wrote to {}".format(loc_outfile))
| kaggle_bag(glob_files, loc_outfile) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import defaultdict, Counter | |
| from glob import glob | |
| import sys | |
| import re | |
# CLI arguments: glob of submission files, output path, and an optional
# third argument selecting the weighting strategy ("uniform" or "weighted").
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
weights_strategy = "uniform"
if len(sys.argv) == 4:
    weights_strategy = sys.argv[3]
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    """Ensemble Kaggle submission files by (optionally weighted) majority vote.

    Each matched CSV (header + `id,prediction` rows) casts one vote per id;
    with weights == "weighted", a filename containing `_w<digits>_` (or
    `_W<digits>_`) multiplies that file's vote count. The most common value
    per id is written to `loc_outfile`.
    """
    # filenames like "model_w3_.csv" carry an integer vote weight after _w/_W
    pattern = re.compile(r"(.)*_[w|W](\d*)_[.]*")
    if method == "average":
        scores = defaultdict(list)
        with open(loc_outfile, "w") as outfile:
            # weight_list may be useful using a different method
            weight_list = [1] * len(glob(glob_files))
            for i, glob_file in enumerate(glob(glob_files)):
                print("parsing: {}".format(glob_file))
                if weights == "weighted":
                    weight = pattern.match(glob_file)
                    if weight and weight.group(2):
                        print("Using weight: {}".format(weight.group(2)))
                        weight_list[i] = weight_list[i] * int(weight.group(2))
                    else:
                        print("Using weight: 1")
                # sort glob_file by first column, ignoring the first line
                # (fix: close the input handle instead of leaking it)
                with open(glob_file) as infile:
                    lines = infile.readlines()
                lines = [lines[0]] + sorted(lines[1:])
                for e, line in enumerate(lines):
                    if i == 0 and e == 0:
                        outfile.write(line)  # copy the header exactly once
                    if e > 0:
                        row = line.strip().split(",")
                        # a weight of w counts as w identical votes
                        for l in range(1, weight_list[i] + 1):
                            scores[(e, row[0])].append(row[1])
            for j, k in sorted(scores):
                outfile.write("%s,%s\n" % (k, Counter(scores[(j, k)]).most_common(1)[0][0]))
    print("wrote to {}".format(loc_outfile))
| kaggle_bag(glob_files, loc_outfile, weights=weights_strategy) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment