Created
March 23, 2015 17:19
-
-
Save chrisdubois/6b93a8028f4dc40cab49 to your computer and use it in GitHub Desktop.
Starter code for Otto Group Product Classification
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import graphlab as gl | |
import math | |
import random | |
# Load the training and test tables from CSV files under data/.
train, test = (gl.SFrame.read_csv(path)
               for path in ('data/train.csv', 'data/test.csv'))

# Drop the 'id' column from the training frame -- presumably so it is not
# picked up as a feature when the model is fit (no feature list is given).
del train['id']
def make_submission(m, test, filename):
    """Predict class probabilities for `test` and save them to `filename`.

    m        -- fitted model exposing `predict_topk`
    test     -- frame of rows to score
    filename -- path the resulting table is saved to

    The output has one row per id and one column per predicted class.
    """
    # Long format: one (id, class, probability) row per class, top 9 classes.
    top = m.predict_topk(test, output_type='probability', k=9)

    # Shift ids by one -- presumably predict_topk ids are zero-based row
    # numbers while the submission ids start at 1 (TODO confirm).
    top['id'] = top['id'].astype(int) + 1

    # Collapse to one {class: probability} dict per id, then spread the dict
    # into one column per class (empty column-name prefix).
    per_id = top.unstack(['class', 'probability'], 'probs')
    wide = per_id.unpack('probs', '')

    wide.sort('id').save(filename)
def multiclass_logloss(model, test):
    """Return the mean multiclass log loss of `model` on the rows of `test`.

    model -- fitted model exposing `predict_topk`
    test  -- frame containing a 'target' column with the true class labels

    Each probability is bounded to [1e-15, 1 - 1e-15] before taking the
    logarithm, as in the competition's log-loss definition, so a predicted
    probability of exactly 0 yields a large finite penalty instead of making
    `math.log` raise ValueError.

    Raises ZeroDivisionError if `test` has no rows.
    """
    eps = 1e-15

    preds = model.predict_topk(test, output_type='probability', k=9)
    # One row per id with one column per class holding its probability.
    preds = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '')
    preds['id'] = preds['id'].astype(int) + 1
    # Sorting by id is what lines the predictions up with the rows of `test`
    # before the true labels are attached by position.
    preds = preds.sort('id')
    preds['target'] = test['target']

    neg_log_loss = 0.0
    for row in preds:
        label = row['target']
        prob = row[label]
        if prob is None:
            # No probability reported for the true class: treat it as 0.
            prob = 0.0
        prob = min(max(prob, eps), 1 - eps)
        neg_log_loss -= math.log(prob)
    return neg_log_loss / preds.num_rows()
def shuffle(sf):
    """Return a new frame holding the rows of `sf` in random order.

    A temporary column '_id' of random sort keys is attached to `sf`, the
    frame is sorted by it, and the column is removed again from both the
    sorted result and from `sf` itself, so the caller's frame is not left
    with a stray helper column.

    NOTE: an existing column named '_id' in `sf` would be overwritten and
    then removed.
    """
    key = '_id'
    # `range` works on both Python 2 and Python 3.
    sf[key] = [random.random() for _ in range(sf.num_rows())]
    try:
        shuffled = sf.sort(key)
    finally:
        # The column assignment above modified the caller's frame; undo it.
        del sf[key]
    del shuffled[key]
    return shuffled
def evaluate_logloss(model, train, valid):
    """Compute the log loss of `model` on the training and validation data.

    Returns a dict with keys 'train_logloss' and 'valid_logloss', each
    holding the value of `multiclass_logloss` for the respective frame.
    """
    scores = {}
    scores['train_logloss'] = multiclass_logloss(model, train)
    scores['valid_logloss'] = multiclass_logloss(model, valid)
    return scores
# Keyword arguments passed to gl.boosted_trees_classifier.create().
params = {
    'target': 'target',          # name of the label column
    'max_iterations': 250,
    'max_depth': 10,
    'min_child_weight': 4,
    'row_subsample': 0.9,
    'min_loss_reduction': 1,
    'column_subsample': 0.8,
    'validation_set': None,      # no validation set is handed to create()
}
# Randomise the row order of the training data.
train = shuffle(train)

# Check performance on an internal validation set: fit on one part of a
# random split (fraction 0.8) and report log loss on both parts.
tr, va = train.random_split(.8)
m = gl.boosted_trees_classifier.create(tr, **params)
print(evaluate_logloss(m, tr, va))

# Make the final submission: refit on the full training set and write the
# predictions for the test set to submission.csv.
m = gl.boosted_trees_classifier.create(train, **params)
make_submission(m, test, 'submission.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment