Created
October 20, 2014 17:54
-
-
Save elyase/06ab806eaf2d84871422 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "", | |
| "signature": "sha256:b99bf1f68a92860ccebd342a666077e249dfb487cea12a567d68facfaa39ab39" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "@ morph, for the YSDA ML Trainings 18 October, 2014" | |
| ] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "Download data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "raw", | |
| "metadata": {}, | |
| "source": [ | |
| "data_dir = 'tradeshift/'\n", | |
| "!mkdir {data_dir}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "raw", | |
| "metadata": {}, | |
| "source": [ | |
| "!wget 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/3984/train.csv.gz?sv=2012-02-12&se=2014-10-21T00%3A06%3A50Z&sr=b&sp=r&sig=cupgPW%2BU6BpdsnrykcEBBRqLEW565pXYQ6k%2FSc0Me1M%3D' -O {data_dir + 'train.csv.gz'}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "raw", | |
| "metadata": {}, | |
| "source": [ | |
| "!wget 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/3984/test.csv.gz?sv=2012-02-12&se=2014-10-21T00%3A09%3A52Z&sr=b&sp=r&sig=YLQCFyAdhIRnz2o4p24zRssUjHYjQ1xOHuTKFsdLxu8%3D' -O {data_dir + 'test.csv.gz'}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "raw", | |
| "metadata": {}, | |
| "source": [ | |
| "!wget 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/3984/trainLabels.csv.gz?sv=2012-02-12&se=2014-10-21T00%3A11%3A04Z&sr=b&sp=r&sig=%2Bm9sbZYXOY8L80d1PJEdumGPXvkQby2rpkVOf1fvjUM%3D' -O {data_dir + 'trainLabels.csv.gz'}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "Unpack" | |
| ] | |
| }, | |
| { | |
| "cell_type": "raw", | |
| "metadata": {}, | |
| "source": [ | |
| "%%time\n", | |
| "\n", | |
| "!gunzip {data_dir + '*.gz'}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "raw", | |
| "metadata": {}, | |
| "source": [ | |
| "!ls -l -h {data_dir}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "Big data: work on a sample" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import pandas as pd" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 13 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "train = pd.read_csv(data_dir + 'train.csv')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 14 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "train.shape" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 15, | |
| "text": [ | |
| "(1700000, 146)" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 15 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "sample_size = 170000\n", | |
| "ratio = train.shape[0] / sample_size\n", | |
| "\n", | |
| "train_sample = train[\n", | |
| " [hash(id) % ratio == 0 for id in train['id']]\n", | |
| "]\n", | |
| "\n", | |
| "train_sample.shape" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 16, | |
| "text": [ | |
| "(170000, 146)" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 16 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "train_sample.to_csv(data_dir + 'train_sample.csv', index = False)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 17 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "# Free memory\n", | |
| "\n", | |
| "del train" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 18 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "Try to make something useful" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import pandas as pd\n", | |
| "\n", | |
| "data_dir = 'tradeshift/'" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 1 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "train_sample = pd.read_csv(data_dir + 'train_sample.csv')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 2 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "labels = pd.read_csv(data_dir + 'trainLabels.csv')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 3 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "labels.columns" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 4, | |
| "text": [ | |
| "Index([u'id', u'y1', u'y2', u'y3', u'y4', u'y5', u'y6', u'y7', u'y8', u'y9', u'y10', u'y11', u'y12', u'y13', u'y14', u'y15', u'y16', u'y17', u'y18', u'y19', u'y20', u'y21', u'y22', u'y23', u'y24', u'y25', u'y26', u'y27', u'y28', u'y29', u'y30', u'y31', u'y32', u'y33'], dtype='object')" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 4 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "train_with_labels = pd.merge(train_sample, labels, on = 'id')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 5 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "train_with_labels.shape" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 6, | |
| "text": [ | |
| "(170000, 179)" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 6 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from collections import Counter\n", | |
| "\n", | |
| "Counter([name[0] for name in train_with_labels.columns])" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 7, | |
| "text": [ | |
| "Counter({'x': 145, 'y': 33, 'i': 1})" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "del labels\n", | |
| "del train_sample" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 8 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "test = pd.read_csv(data_dir + 'test.csv')" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 9 | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "Categorical values encoding" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from sklearn.feature_extraction import DictVectorizer\n", | |
| "\n", | |
| "X_numerical = []\n", | |
| "X_test_numerical = []\n", | |
| "\n", | |
| "vec = DictVectorizer()\n", | |
| "\n", | |
| "names_categorical = []\n", | |
| "\n", | |
| "train_with_labels.replace('YES', 1, inplace = True)\n", | |
| "train_with_labels.replace('NO', 0, inplace = True)\n", | |
| "train_with_labels.replace('nan', np.NaN, inplace = True)\n", | |
| "\n", | |
| "test.replace('YES', 1, inplace = True)\n", | |
| "test.replace('NO', 0, inplace = True)\n", | |
| "test.replace('nan', np.NaN, inplace = True)\n", | |
| "\n", | |
| "\n", | |
| "for name in train_with_labels.columns : \n", | |
| " if name.startswith('x') :\n", | |
| " column_type, _ = max(Counter(map(lambda x: str(type(x)), train_with_labels[name])).items(), key = lambda x: x[1])\n", | |
| " \n", | |
| " # LOL expression\n", | |
| " if column_type == str(str) :\n", | |
| " train_with_labels[name] = map(str, train_with_labels[name])\n", | |
| " test[name] = map(str, test[name])\n", | |
| "\n", | |
| " names_categorical.append(name)\n", | |
| " print name, len(np.unique(train_with_labels[name]))\n", | |
| " else :\n", | |
| " X_numerical.append(train_with_labels[name].fillna(-999))\n", | |
| " X_test_numerical.append(test[name].fillna(-999))\n", | |
| " \n", | |
| "X_numerical = np.column_stack(X_numerical)\n", | |
| "X_test_numerical = np.column_stack(X_test_numerical)\n", | |
| "\n", | |
| "X_sparse = vec.fit_transform(train_with_labels[names_categorical].T.to_dict().values())\n", | |
| "X_test_sparse = vec.transform(test[names_categorical].T.to_dict().values())\n", | |
| "\n", | |
| "print X_numerical.shape, X_sparse.shape, X_test_numerical.shape, X_test_sparse.shape" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "x3 " | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "40882\n", | |
| "x4" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " 5019\n", | |
| "x34" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " 48090\n", | |
| "x35" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " 6797\n", | |
| "x61" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " 78363\n", | |
| "x64" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " 49408\n", | |
| "x65" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " 7035\n", | |
| "x91" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " 27960\n", | |
| "x94" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " 37786\n", | |
| "x95" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " 5086\n", | |
| "(170000, 135)" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| " (170000, 306426) (545082, 135) (545082, 306426)\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 10 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "X_numerical = np.nan_to_num(X_numerical)\n", | |
| "X_test_numerical = np.nan_to_num(X_test_numerical)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from sklearn.externals import joblib\n", | |
| "\n", | |
| "joblib.dump(\n", | |
| " (X_numerical, X_sparse, X_test_numerical, X_test_sparse),\n", | |
| " data_dir + 'X.dump',\n", | |
| " compress = 1,\n", | |
| ")" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 2, | |
| "metadata": {}, | |
| "source": [ | |
| "Trying to predict something" | |
| ] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "Build a two-level classifier: first train the base level" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "from sklearn.metrics import roc_auc_score, f1_score, log_loss, make_scorer\n", | |
| "from sklearn.svm import LinearSVC\n", | |
| "from sklearn.cross_validation import cross_val_score, train_test_split\n", | |
| "from sklearn.ensemble import RandomForestClassifier\n", | |
| "\n", | |
| "log_loss_scorer = make_scorer(log_loss, needs_proba = True)\n", | |
| "\n", | |
| "y_columns = [name for name in train_with_labels.columns if name.startswith('y')]\n", | |
| "\n", | |
| "X_numerical_base, X_numerical_meta, X_sparse_base, X_sparse_meta, y_base, y_meta = train_test_split(\n", | |
| " X_numerical, \n", | |
| " X_sparse, \n", | |
| " train_with_labels[y_columns].values,\n", | |
| " test_size = 0.5\n", | |
| ")\n", | |
| "\n", | |
| "X_meta = [] \n", | |
| "X_test_meta = []\n", | |
| "\n", | |
| "print \"Build meta\"\n", | |
| "\n", | |
| "for i in range(y_base.shape[1]) :\n", | |
| " print i\n", | |
| " \n", | |
| " y = y_base[:, i]\n", | |
| " if len(np.unique(y)) == 2 : \n", | |
| " rf = RandomForestClassifier(n_estimators = 10, n_jobs = 1)\n", | |
| " rf.fit(X_numerical_base, y)\n", | |
| " X_meta.append(rf.predict_proba(X_numerical_meta))\n", | |
| " X_test_meta.append(rf.predict_proba(X_test_numerical))\n", | |
| "\n", | |
| " svm = LinearSVC()\n", | |
| " svm.fit(X_sparse_base, y)\n", | |
| " X_meta.append(svm.decision_function(X_sparse_meta))\n", | |
| " X_test_meta.append(svm.decision_function(X_test_sparse))\n", | |
| " \n", | |
| "X_meta = np.column_stack(X_meta)\n", | |
| "X_test_meta = np.column_stack(X_test_meta)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Build meta\n", | |
| "0\n", | |
| "1" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "print X_meta.shape, X_test_meta.shape" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "Train the meta level and get predictions for the test set" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "p_test = []\n", | |
| "\n", | |
| "for i in range(y_base.shape[1]) :\n", | |
| " y = y_meta[:, i]\n", | |
| "\n", | |
| " constant = Counter(y)\n", | |
| " constant = constant[0] < 4 or constant[1] < 4\n", | |
| " \n", | |
| " predicted = None\n", | |
| " \n", | |
| " if constant :\n", | |
| " # Best constant\n", | |
| " constant_pred = np.mean(list(y_base[:, i]) + list(y_meta[:, i]))\n", | |
| " \n", | |
| " predicted = np.ones(X_test_meta.shape[0]) * constant_pred\n", | |
| " print \"%d is constant like: %f\" % (i, constant_pred)\n", | |
| " else :\n", | |
| " rf = RandomForestClassifier(n_estimators=30, n_jobs = 1)\n", | |
| " rf.fit(np.hstack([X_meta, X_numerical_meta]), y)\n", | |
| "\n", | |
| " predicted = rf.predict_proba(np.hstack([X_test_meta, X_test_numerical]))\n", | |
| "\n", | |
| " predicted = predicted[:, 1]\n", | |
| " \n", | |
| " rf = RandomForestClassifier(n_estimators=30, n_jobs = 1)\n", | |
| " scores = cross_val_score(rf, np.hstack([X_meta, X_numerical_meta]), y, cv = 4, n_jobs = 1, scoring = log_loss_scorer)\n", | |
| "\n", | |
| " print i, 'RF log-loss: %.4f \u00b1 %.4f, mean = %.6f' %(np.mean(scores), np.std(scores), np.mean(predicted))\n", | |
| "\n", | |
| " \n", | |
| " p_test.append(\n", | |
| " predicted\n", | |
| " )\n", | |
| " \n", | |
| "p_test = np.column_stack(p_test)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "Save predictions" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "p_test.shape" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import gzip\n", | |
| "\n", | |
| "def save_predictions(name, ids, predictions) :\n", | |
| " out = gzip.open(name, 'w')\n", | |
| " print >>out, 'id_label,pred'\n", | |
| " for id, id_predictions in zip(test['id'], p_test) :\n", | |
| " for y_id, pred in enumerate(id_predictions) :\n", | |
| " if pred == 0 or pred == 1 :\n", | |
| " pred = str(int(pred))\n", | |
| " else :\n", | |
| " pred = '%.6f' % pred\n", | |
| " print >>out, '%d_y%d,%s' % (id, y_id + 1, pred)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "save_predictions('quick_start.csv.gz', test['id'].values, p_test)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "!ls -l -h quick_start*.csv.gz" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "Public result" | |
| ] | |
| }, | |
| { | |
| "cell_type": "heading", | |
| "level": 3, | |
| "metadata": {}, | |
| "source": [ | |
| "Quick start on 10% of the training set: 0.0212323" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment