[Kaggle utils] #python
"""Kaggle competition: Predicting a Biological Response.
Blending {RandomForests, ExtraTrees, GradientBoosting} + stretching to
[0,1]. The blending scheme is related to the idea Jose H. Solorzano
presented here:
http://www.kaggle.com/c/bioresponse/forums/t/1889/question-about-the-process-of-ensemble-learning/10950#post10950
'''You can try this: In one of the 5 folds, train the models, then use
the results of the models as 'variables' in logistic regression over
the validation data of that fold'''. Or at least this is the
implementation of my understanding of that idea :-)
The predictions are saved in test.csv. The code below created my best
submission to the competition:
- public score (25%): 0.43464
- private score (75%): 0.37751
- final rank on the private leaderboard: 17th over 711 teams :-)
Note: if you increase the number of estimators of the classifiers,
e.g. n_estimators=1000, you get a better score/rank on the private
test set.
Copyright 2012, Emanuele Olivetti.
BSD license, 3 clauses.
"""
from __future__ import division
import numpy as np
import load_data
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
def logloss(attempt, actual, epsilon=1.0e-15):
    """Logloss, i.e. the score of the bioresponse competition."""
    attempt = np.clip(attempt, epsilon, 1.0 - epsilon)
    return -np.mean(actual * np.log(attempt) +
                    (1.0 - actual) * np.log(1.0 - attempt))


if __name__ == '__main__':

    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 10
    verbose = True
    shuffle = False

    X, y, X_submission = load_data.load()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

    print "Creating train and test sets for blending."

    # out-of-fold predictions of each base model become the meta-features
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        # average the per-fold test-set predictions for this model
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print "Saving Results."
    tmp = np.vstack([range(1, len(y_submission) + 1), y_submission]).T
    np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',
               header='MoleculeId,PredictedProbability', comments='')
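# ---------------------------------------------------------------------------
# The blending script above imports a local `load_data` module that is not
# included in this gist. The sketch below shows one way that module could
# look, assuming Kaggle bioresponse-style train.csv / test.csv files with the
# target in an "Activity" column; the file names and column name are
# assumptions, not part of the original script.
# ---------------------------------------------------------------------------
import pandas as pd


def load():
    train = pd.read_csv('train.csv')               # assumed training file
    test = pd.read_csv('test.csv')                 # assumed test file
    y = train['Activity'].values                   # assumed target column
    X = train.drop('Activity', axis=1).values
    X_submission = test.values
    return X, y, X_submission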
from sklearn import cross_validation
from sklearn.metrics import log_loss, accuracy_score
import numpy as np
import pandas as pd
import random
import md5
import json
def blend_proba(clf, X_train, y, X_test, nfolds=5, save_preds="",
                save_test_only="", seed=300373, save_params="",
                clf_name="XX", generalizers_params=[], minimal_loss=0,
                return_score=False, minimizer="log_loss"):
    print("\nBlending with classifier:\n\t{}".format(clf))
    folds = list(cross_validation.StratifiedKFold(y, nfolds, shuffle=True, random_state=seed))
    print(X_train.shape)
    dataset_blend_train = np.zeros((X_train.shape[0], np.unique(y).shape[0]))

    # iterate through the train set: fit on the other folds, predict the held-out fold
    loss = 0
    for i, (train_index, test_index) in enumerate(folds):
        print("Train Fold {}/{}".format(i + 1, nfolds))
        fold_X_train = X_train[train_index]
        fold_y_train = y[train_index]
        fold_X_test = X_train[test_index]
        fold_y_test = y[test_index]
        clf.fit(fold_X_train, fold_y_train)

        fold_preds = clf.predict_proba(fold_X_test)
        print("Logistic loss: {}".format(log_loss(fold_y_test, fold_preds)))
        dataset_blend_train[test_index] = fold_preds
        if minimizer == "log_loss":
            loss += log_loss(fold_y_test, fold_preds)
        if minimizer == "accuracy":
            fold_preds_a = np.argmax(fold_preds, axis=1)
            loss += accuracy_score(fold_y_test, fold_preds_a)
        # fold_preds = clf.predict(fold_X_test)
        # loss += accuracy_score(fold_y_test, fold_preds)
        if minimal_loss > 0 and loss > minimal_loss and i == 0:
            return False, False
        fold_preds = np.argmax(fold_preds, axis=1)
        print("Accuracy: {}".format(accuracy_score(fold_y_test, fold_preds)))
    avg_loss = loss / float(i + 1)
    print("\nAverage:\t{}\n".format(avg_loss))

    # predict the test set (better to average over all folds, but this is quicker)
    print("Test Fold 1/1")
    clf.fit(X_train, y)
    dataset_blend_test = clf.predict_proba(X_test)

    if clf_name == "XX":
        clf_name = str(clf)[1:3]

    if len(save_preds) > 0:
        id = md5.new("{}".format(clf.get_params())).hexdigest()
        print("storing meta predictions at: {}".format(save_preds))
        np.save("{}_{}_{}_{}_train.npy".format(save_preds, clf_name, avg_loss, id), dataset_blend_train)
        np.save("{}_{}_{}_{}_test.npy".format(save_preds, clf_name, avg_loss, id), dataset_blend_test)

    if len(save_test_only) > 0:
        id = md5.new("{}".format(clf.get_params())).hexdigest()
        print("storing meta predictions at: {}".format(save_test_only))
        dataset_blend_test = clf.predict(X_test)
        np.savetxt("{}_{}_{}_{}_test.txt".format(save_test_only, clf_name, avg_loss, id), dataset_blend_test)
        d = {}
        d["stacker"] = clf.get_params()
        d["generalizers"] = generalizers_params
        with open("{}_{}_{}_{}_params.json".format(save_test_only, clf_name, avg_loss, id), 'wb') as f:
            json.dump(d, f)

    if len(save_params) > 0:
        id = md5.new("{}".format(clf.get_params())).hexdigest()
        d = {}
        d["name"] = clf_name
        d["params"] = {k: (v.get_params() if "\n" in str(v) or "<" in str(v) else v)
                       for k, v in clf.get_params().items()}
        d["generalizers"] = generalizers_params
        with open("{}_{}_{}_{}_params.json".format(save_params, clf_name, avg_loss, id), 'wb') as f:
            json.dump(d, f)

    if np.unique(y).shape[0] == 2:  # binary classification: only return the positive-class proba
        if return_score:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1], avg_loss
        else:
            return dataset_blend_train[:, 1], dataset_blend_test[:, 1]
    else:
        if return_score:
            return dataset_blend_train, dataset_blend_test, avg_loss
        else:
            return dataset_blend_train, dataset_blend_test
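# ---------------------------------------------------------------------------
# Minimal usage sketch for blend_proba above, on synthetic data; the data,
# classifier and clf_name are illustrative assumptions, and the same
# Python 2 / old scikit-learn environment as the function is assumed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X_all, y_all = make_classification(n_samples=200, n_features=20, random_state=0)
    X_tr, y_tr, X_te = X_all[:150], y_all[:150], X_all[150:]

    rf = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=0)
    # train_meta holds out-of-fold probabilities for the train set, test_meta
    # the probabilities of the model refit on all training data; both can be
    # used as features for a second-level (stacker) model.
    train_meta, test_meta = blend_proba(rf, X_tr, y_tr, X_te, nfolds=5, clf_name="RF")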
import pandas as pd
import sys
first_file = sys.argv[1]
second_file = sys.argv[2]
def corr(first_file, second_file):
    first_df = pd.read_csv(first_file, index_col=0)
    second_df = pd.read_csv(second_file, index_col=0)
    # assuming first column is `prediction_id` and second column is `prediction`
    prediction = first_df.columns[0]
    # correlation
    print(first_df[prediction])
    print("Finding correlation between: {} and {}".format(first_file, second_file))
    print("Column to be measured: {}".format(prediction))
    print("Pearson's correlation score: {}".format(first_df[prediction].corr(second_df[prediction], method='pearson')))
    print("Kendall's correlation score: {}".format(first_df[prediction].corr(second_df[prediction], method='kendall')))
    print("Spearman's correlation score: {}".format(first_df[prediction].corr(second_df[prediction], method='spearman')))
corr(first_file, second_file)
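# Example invocation for the correlation check above (file names are
# illustrative), assuming two Kaggle submission files whose first column is an
# id and whose second column is the prediction:
#
#   python correlations.py submission_rf.csv submission_gbm.csv
#
# Two strong but weakly correlated submissions are usually better candidates
# for blending than two highly correlated ones.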
from collections import defaultdict
from glob import glob
import sys
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
# Arithmetic-mean ensembling: average the prediction column of all matched
# submission files, row by row.
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    if method == "average":
        scores = defaultdict(float)
    with open(loc_outfile, "w") as outfile:
        for i, glob_file in enumerate(glob(glob_files)):
            print("parsing: {}".format(glob_file))
            # sort glob_file by first column, ignoring the first line
            lines = open(glob_file).readlines()
            lines = [lines[0]] + sorted(lines[1:])
            for e, line in enumerate(lines):
                if i == 0 and e == 0:
                    outfile.write(line)
                if e > 0:
                    row = line.strip().split(",")
                    scores[(e, row[0])] += float(row[1])
        for j, k in sorted(scores):
            outfile.write("%s,%f\n" % (k, scores[(j, k)] / (i + 1)))
        print("wrote to {}".format(loc_outfile))
kaggle_bag(glob_files, loc_outfile)
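# Example invocation for the averaging script above (file names illustrative);
# quote the glob pattern so the shell does not expand it before Python sees it:
#
#   python kaggle_avg.py "./submissions/method_*.csv" blend_mean.csv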
from __future__ import division
from collections import defaultdict
from glob import glob
import sys
import math
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
# Geometric-mean ensembling: multiply the predictions from all matched files
# and take the n-th root, row by row.
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    if method == "average":
        scores = defaultdict(float)
    with open(loc_outfile, "w") as outfile:
        for i, glob_file in enumerate(glob(glob_files)):
            print("parsing: {}".format(glob_file))
            # sort glob_file by first column, ignoring the first line
            lines = open(glob_file).readlines()
            lines = [lines[0]] + sorted(lines[1:])
            for e, line in enumerate(lines):
                if i == 0 and e == 0:
                    outfile.write(line)
                if e > 0:
                    row = line.strip().split(",")
                    if scores[(e, row[0])] == 0:
                        scores[(e, row[0])] = 1
                    scores[(e, row[0])] *= float(row[1])
        for j, k in sorted(scores):
            outfile.write("%s,%f\n" % (k, math.pow(scores[(j, k)], 1 / (i + 1))))
        print("wrote to {}".format(loc_outfile))
kaggle_bag(glob_files, loc_outfile)
from __future__ import division
from collections import defaultdict
from glob import glob
import sys
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
# Rank-average ensembling: replace each file's predictions by their ranks,
# average the ranks per id, then rescale the final ranking to [0, 1].
def kaggle_bag(glob_files, loc_outfile):
    with open(loc_outfile, "w") as outfile:
        all_ranks = defaultdict(list)
        for i, glob_file in enumerate(glob(glob_files)):
            file_ranks = []
            print("parsing: {}".format(glob_file))
            # sort glob_file by first column, ignoring the first line
            lines = open(glob_file).readlines()
            lines = [lines[0]] + sorted(lines[1:])
            for e, line in enumerate(lines):
                if e == 0 and i == 0:
                    outfile.write(line)
                elif e > 0:
                    r = line.strip().split(",")
                    file_ranks.append((float(r[1]), e, r[0]))
            for rank, item in enumerate(sorted(file_ranks)):
                all_ranks[(item[1], item[2])].append(rank)
        average_ranks = []
        for k in sorted(all_ranks):
            average_ranks.append((sum(all_ranks[k]) / len(all_ranks[k]), k))
        ranked_ranks = []
        for rank, k in enumerate(sorted(average_ranks)):
            ranked_ranks.append((k[1][0], k[1][1], rank / (len(average_ranks) - 1)))
        for k in sorted(ranked_ranks):
            outfile.write("%s,%s\n" % (k[1], k[2]))
        print("wrote to {}".format(loc_outfile))
kaggle_bag(glob_files, loc_outfile)
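# Worked example for the rank averaging above, assuming three ids and two
# submission files: an id ranked 0 in one file and 2 in the other gets an
# average rank of 1.0; the average ranks are then ranked again and scaled by
# rank/(n-1), so the written predictions are evenly spaced in [0, 1]
# (0, 0.5, 1.0 for three ids). Only the ordering survives, which makes this
# suited to rank-based metrics such as AUC rather than to log loss.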
from collections import defaultdict, Counter
from glob import glob
import sys
import re
glob_files = sys.argv[1]
loc_outfile = sys.argv[2]
weights_strategy = "uniform"
if len(sys.argv) == 4:
    weights_strategy = sys.argv[3]


# Majority-vote ensembling: take the most common prediction per id across all
# matched files; with weights="weighted", a file whose name matches _W<digits>_
# has its vote counted <digits> times.
def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
    pattern = re.compile(r"(.)*_[w|W](\d*)_[.]*")
    if method == "average":
        scores = defaultdict(list)
    with open(loc_outfile, "w") as outfile:
        # weight_list may be useful with a different method
        weight_list = [1] * len(glob(glob_files))
        for i, glob_file in enumerate(glob(glob_files)):
            print("parsing: {}".format(glob_file))
            if weights == "weighted":
                weight = pattern.match(glob_file)
                if weight and weight.group(2):
                    print("Using weight: {}".format(weight.group(2)))
                    weight_list[i] = weight_list[i] * int(weight.group(2))
                else:
                    print("Using weight: 1")
            # sort glob_file by first column, ignoring the first line
            lines = open(glob_file).readlines()
            lines = [lines[0]] + sorted(lines[1:])
            for e, line in enumerate(lines):
                if i == 0 and e == 0:
                    outfile.write(line)
                if e > 0:
                    row = line.strip().split(",")
                    for l in range(1, weight_list[i] + 1):
                        scores[(e, row[0])].append(row[1])
        for j, k in sorted(scores):
            outfile.write("%s,%s\n" % (k, Counter(scores[(j, k)]).most_common(1)[0][0]))
        print("wrote to {}".format(loc_outfile))
kaggle_bag(glob_files, loc_outfile, weights=weights_strategy)
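# Example invocation for the voting script above (file names illustrative);
# the weighted mode reads an integer weight from the file name via the
# _[wW]<digits>_ pattern:
#
#   python kaggle_vote.py "./submissions/method_*.csv" blend_vote.csv weighted
#
# A file named method_W3_xgb.csv would then be counted three times in the
# majority vote, while files without a _W<digits>_ tag keep weight 1.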