Created
October 19, 2014 05:15
-
-
Save lxc-xx/298446b12182c51752d7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:b99bf1f68a92860ccebd342a666077e249dfb487cea12a567d68facfaa39ab39" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"@ morph, for the YSDA ML Trainings 18 October, 2014" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Download data" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"data_dir = 'tradeshift/'\n", | |
"!mkdir {data_dir}" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"!wget 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/3984/train.csv.gz?sv=2012-02-12&se=2014-10-21T00%3A06%3A50Z&sr=b&sp=r&sig=cupgPW%2BU6BpdsnrykcEBBRqLEW565pXYQ6k%2FSc0Me1M%3D' -O {data_dir + 'train.csv.gz'}" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"!wget 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/3984/test.csv.gz?sv=2012-02-12&se=2014-10-21T00%3A09%3A52Z&sr=b&sp=r&sig=YLQCFyAdhIRnz2o4p24zRssUjHYjQ1xOHuTKFsdLxu8%3D' -O {data_dir + 'test.csv.gz'}" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"!wget 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/3984/trainLabels.csv.gz?sv=2012-02-12&se=2014-10-21T00%3A11%3A04Z&sr=b&sp=r&sig=%2Bm9sbZYXOY8L80d1PJEdumGPXvkQby2rpkVOf1fvjUM%3D' -O {data_dir + 'trainLabels.csv.gz'}" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Unpack" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"%%time\n", | |
"\n", | |
"!gunzip {data_dir + '*.gz'}" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"!ls -l -h {data_dir}" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Big Data -- Sample Data!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"train = pd.read_csv(data_dir + 'train.csv')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"train.shape" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 15, | |
"text": [ | |
"(1700000, 146)" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sample_size = 170000\n", | |
"ratio = train.shape[0] / sample_size\n", | |
"\n", | |
"train_sample = train[\n", | |
" [hash(id) % ratio == 0 for id in train['id']]\n", | |
"]\n", | |
"\n", | |
"train_sample.shape" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 16, | |
"text": [ | |
"(170000, 146)" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"train_sample.to_csv(data_dir + 'train_sample.csv', index = False)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Free memory\n", | |
"\n", | |
"del train" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Try to make something useful" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd\n", | |
"\n", | |
"data_dir = 'tradeshift/'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"train_sample = pd.read_csv(data_dir + 'train_sample.csv')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"labels = pd.read_csv(data_dir + 'trainLabels.csv')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"labels.columns" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"Index([u'id', u'y1', u'y2', u'y3', u'y4', u'y5', u'y6', u'y7', u'y8', u'y9', u'y10', u'y11', u'y12', u'y13', u'y14', u'y15', u'y16', u'y17', u'y18', u'y19', u'y20', u'y21', u'y22', u'y23', u'y24', u'y25', u'y26', u'y27', u'y28', u'y29', u'y30', u'y31', u'y32', u'y33'], dtype='object')" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"train_with_labels = pd.merge(train_sample, labels, on = 'id')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"train_with_labels.shape" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"(170000, 179)" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from collections import Counter\n", | |
"\n", | |
"Counter([name[0] for name in train_with_labels.columns])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"Counter({'x': 145, 'y': 33, 'i': 1})" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"del labels\n", | |
"del train_sample" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"test = pd.read_csv(data_dir + 'test.csv')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Categorical values encoding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from sklearn.feature_extraction import DictVectorizer\n", | |
"\n", | |
"X_numerical = []\n", | |
"X_test_numerical = []\n", | |
"\n", | |
"vec = DictVectorizer()\n", | |
"\n", | |
"names_categorical = []\n", | |
"\n", | |
"train_with_labels.replace('YES', 1, inplace = True)\n", | |
"train_with_labels.replace('NO', 0, inplace = True)\n", | |
"train_with_labels.replace('nan', np.NaN, inplace = True)\n", | |
"\n", | |
"test.replace('YES', 1, inplace = True)\n", | |
"test.replace('NO', 0, inplace = True)\n", | |
"test.replace('nan', np.NaN, inplace = True)\n", | |
"\n", | |
"\n", | |
"for name in train_with_labels.columns : \n", | |
" if name.startswith('x') :\n", | |
" column_type, _ = max(Counter(map(lambda x: str(type(x)), train_with_labels[name])).items(), key = lambda x: x[1])\n", | |
" \n", | |
" # LOL expression\n", | |
" if column_type == str(str) :\n", | |
" train_with_labels[name] = map(str, train_with_labels[name])\n", | |
" test[name] = map(str, test[name])\n", | |
"\n", | |
" names_categorical.append(name)\n", | |
" print name, len(np.unique(train_with_labels[name]))\n", | |
" else :\n", | |
" X_numerical.append(train_with_labels[name].fillna(-999))\n", | |
" X_test_numerical.append(test[name].fillna(-999))\n", | |
" \n", | |
"X_numerical = np.column_stack(X_numerical)\n", | |
"X_test_numerical = np.column_stack(X_test_numerical)\n", | |
"\n", | |
"X_sparse = vec.fit_transform(train_with_labels[names_categorical].T.to_dict().values())\n", | |
"X_test_sparse = vec.transform(test[names_categorical].T.to_dict().values())\n", | |
"\n", | |
"print X_numerical.shape, X_sparse.shape, X_test_numerical.shape, X_test_sparse.shape" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"x3 " | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"40882\n", | |
"x4" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 5019\n", | |
"x34" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 48090\n", | |
"x35" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 6797\n", | |
"x61" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 78363\n", | |
"x64" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 49408\n", | |
"x65" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 7035\n", | |
"x91" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 27960\n", | |
"x94" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 37786\n", | |
"x95" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 5086\n", | |
"(170000, 135)" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" (170000, 306426) (545082, 135) (545082, 306426)\n" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"X_numerical = np.nan_to_num(X_numerical)\n", | |
"X_test_numerical = np.nan_to_num(X_test_numerical)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from sklearn.externals import joblib\n", | |
"\n", | |
"joblib.dump(\n", | |
" (X_numerical, X_sparse, X_test_numerical, X_test_sparse),\n", | |
" data_dir + 'X.dump',\n", | |
" compress = 1,\n", | |
")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"Trying to predict something" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Build two level classifier, first train base level" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from sklearn.metrics import roc_auc_score, f1_score, log_loss, make_scorer\n", | |
"from sklearn.svm import LinearSVC\n", | |
"from sklearn.cross_validation import cross_val_score, train_test_split\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"\n", | |
"log_loss_scorer = make_scorer(log_loss, needs_proba = True)\n", | |
"\n", | |
"y_columns = [name for name in train_with_labels.columns if name.startswith('y')]\n", | |
"\n", | |
"X_numerical_base, X_numerical_meta, X_sparse_base, X_sparse_meta, y_base, y_meta = train_test_split(\n", | |
" X_numerical, \n", | |
" X_sparse, \n", | |
" train_with_labels[y_columns].values,\n", | |
" test_size = 0.5\n", | |
")\n", | |
"\n", | |
"X_meta = [] \n", | |
"X_test_meta = []\n", | |
"\n", | |
"print \"Build meta\"\n", | |
"\n", | |
"for i in range(y_base.shape[1]) :\n", | |
" print i\n", | |
" \n", | |
" y = y_base[:, i]\n", | |
" if len(np.unique(y)) == 2 : \n", | |
" rf = RandomForestClassifier(n_estimators = 10, n_jobs = 1)\n", | |
" rf.fit(X_numerical_base, y)\n", | |
" X_meta.append(rf.predict_proba(X_numerical_meta))\n", | |
" X_test_meta.append(rf.predict_proba(X_test_numerical))\n", | |
"\n", | |
" svm = LinearSVC()\n", | |
" svm.fit(X_sparse_base, y)\n", | |
" X_meta.append(svm.decision_function(X_sparse_meta))\n", | |
" X_test_meta.append(svm.decision_function(X_test_sparse))\n", | |
" \n", | |
"X_meta = np.column_stack(X_meta)\n", | |
"X_test_meta = np.column_stack(X_test_meta)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Build meta\n", | |
"0\n", | |
"1" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print X_meta.shape, X_test_meta.shape" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Here train meta level and get predictions for test set" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p_test = []\n", | |
"\n", | |
"for i in range(y_base.shape[1]) :\n", | |
" y = y_meta[:, i]\n", | |
"\n", | |
" constant = Counter(y)\n", | |
" constant = constant[0] < 4 or constant[1] < 4\n", | |
" \n", | |
" predicted = None\n", | |
" \n", | |
" if constant :\n", | |
" # Best constant\n", | |
" constant_pred = np.mean(list(y_base[:, i]) + list(y_meta[:, i]))\n", | |
" \n", | |
" predicted = np.ones(X_test_meta.shape[0]) * constant_pred\n", | |
" print \"%d is constant like: %f\" % (i, constant_pred)\n", | |
" else :\n", | |
" rf = RandomForestClassifier(n_estimators=30, n_jobs = 1)\n", | |
" rf.fit(np.hstack([X_meta, X_numerical_meta]), y)\n", | |
"\n", | |
" predicted = rf.predict_proba(np.hstack([X_test_meta, X_test_numerical]))\n", | |
"\n", | |
" predicted = predicted[:, 1]\n", | |
" \n", | |
" rf = RandomForestClassifier(n_estimators=30, n_jobs = 1)\n", | |
" scores = cross_val_score(rf, np.hstack([X_meta, X_numerical_meta]), y, cv = 4, n_jobs = 1, scoring = log_loss_scorer)\n", | |
"\n", | |
" print i, 'RF log-loss: %.4f \u00b1 %.4f, mean = %.6f' %(np.mean(scores), np.std(scores), np.mean(predicted))\n", | |
"\n", | |
" \n", | |
" p_test.append(\n", | |
" predicted\n", | |
" )\n", | |
" \n", | |
"p_test = np.column_stack(p_test)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Save predictions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p_test.shape" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import gzip\n", | |
"\n", | |
"def save_predictions(name, ids, predictions) :\n", | |
" out = gzip.open(name, 'w')\n", | |
" print >>out, 'id_label,pred'\n", | |
" for id, id_predictions in zip(test['id'], p_test) :\n", | |
" for y_id, pred in enumerate(id_predictions) :\n", | |
" if pred == 0 or pred == 1 :\n", | |
" pred = str(int(pred))\n", | |
" else :\n", | |
" pred = '%.6f' % pred\n", | |
" print >>out, '%d_y%d,%s' % (id, y_id + 1, pred)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"save_predictions('quick_start.csv.gz', test['id'].values, p_test)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!ls -l -h quick_start*.csv.gz" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Public result" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"Quick start on 10% of train - 0.0212323" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment