Created
October 4, 2018 15:52
-
-
Save jorisvandenbossche/7ff5ad39da2b306e8e3dc4f91b534ea0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "%matplotlib inline\n\nimport pandas as pd", | |
| "execution_count": 7, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "import problem", | |
| "execution_count": 8, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "data_train, y_train = problem.get_train_data()", | |
| "execution_count": 9, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "data_test, y_test = problem.get_test_data()", | |
| "execution_count": 10, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "", | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "import sys\nsys.path.insert(0, './submissions/starting_kit/')", | |
| "execution_count": 12, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "from feature_extractor import FeatureExtractor", | |
| "execution_count": 13, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "fe = FeatureExtractor()", | |
| "execution_count": 14, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "scrolled": false, | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "X_train_t = fe.transform(data_train)\nX_test_t = fe.transform(data_test)", | |
| "execution_count": 16, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import classification_report, confusion_matrix", | |
| "execution_count": 17, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "model = RandomForestClassifier(n_jobs=-1)", | |
| "execution_count": 18, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "model.fit(X_train_t, y_train)", | |
| "execution_count": 19, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": "/home/joris/scipy/scikit-learn/sklearn/ensemble/forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n", | |
| "name": "stderr" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 19, | |
| "data": { | |
| "text/plain": "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n max_depth=None, max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,\n oob_score=False, random_state=None, verbose=0,\n warm_start=False)" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "model.score(X_test_t, y_test)", | |
| "execution_count": 23, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 23, | |
| "data": { | |
| "text/plain": "0.94548435113389828" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "y_pred = model.predict(X_test_t)", | |
| "execution_count": 20, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "print(classification_report(y_test, y_pred))", | |
| "execution_count": 22, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": " precision recall f1-score support\n\n 0 0.95 1.00 0.97 191755\n 1 0.79 0.25 0.39 13819\n\n micro avg 0.95 0.95 0.95 205574\n macro avg 0.87 0.62 0.68 205574\nweighted avg 0.94 0.95 0.93 205574\n\n", | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "y_proba = model.predict_proba(X_test_t)", | |
| "execution_count": 24, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "" | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "from problem import turnPredictionToEventList", | |
| "execution_count": 25, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "y = pd.Series(y_proba[:, 1], index=data_test.index)", | |
| "execution_count": 27, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "event_pred = turnPredictionToEventList(y)", | |
| "execution_count": 28, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "event_true = turnPredictionToEventList(y_test)", | |
| "execution_count": 29, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "from problem import overlapWithList, find\nimport datetime\n\n# Event-level precision: 1 - (false positives / predicted events).\n# A predicted event is a false positive when its best overlap with the true\n# events is below 0.4 and it lasts at least 2.5 hours.\ndef precision(event_true, event_pred):\n    FP = [x for x in event_pred\n          if max(overlapWithList(x, event_true, percent=True)) < 0.4]\n    FP_too_short = [x for x in FP\n                    if x.duration < datetime.timedelta(hours=2.5)]\n    for event in FP_too_short:\n        FP.remove(event)\n    score = 1-len(FP)/len(event_pred)\n    return score\n\n# Event-level recall: 1 - (false negatives / true events).\n# A true event is a false negative when find() returns None for it.\ndef recall(event_true, event_pred):\n    FN = 0\n    for event in event_true:\n        corresponding = find(event, event_pred, 0.5, 'best')\n        if corresponding is None:\n            FN += 1\n    score = 1-FN/len(event_true)\n    return score\n", | |
| "execution_count": 33, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "precision(event_true, event_pred)", | |
| "execution_count": 34, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 34, | |
| "data": { | |
| "text/plain": "0.9090909090909091" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "recall(event_true, event_pred)", | |
| "execution_count": 35, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 35, | |
| "data": { | |
| "text/plain": "0.0535714285714286" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "", | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "", | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "from problem import overlapWithList, find\nimport datetime\n\n# Same selection as precision(), but returns the list of false-positive\n# events instead of the score.\ndef precision2(event_true, event_pred):\n    FP = [x for x in event_pred\n          if max(overlapWithList(x, event_true, percent=True)) < 0.4]\n    FP_too_short = [x for x in FP\n                    if x.duration < datetime.timedelta(hours=2.5)]\n    for event in FP_too_short:\n        FP.remove(event)\n    return FP\n    # score = 1-len(FP)/len(event_pred)\n    # return score\n\n# Same counting as recall(), but returns the number of false negatives\n# instead of the score.\ndef recall2(event_true, event_pred):\n    FN = 0\n    for event in event_true:\n        corresponding = find(event, event_pred, 0.5, 'best')\n        if corresponding is None:\n            FN += 1\n    return FN\n    #score = 1-FN/len(event_true)\n    #return score\n", | |
| "execution_count": 37, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "FP = precision2(event_true, event_pred)", | |
| "execution_count": 39, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "FN = recall2(event_true, event_pred)", | |
| "execution_count": 38, | |
| "outputs": [] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "len(FP)", | |
| "execution_count": 40, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 40, | |
| "data": { | |
| "text/plain": "39" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "FN", | |
| "execution_count": 42, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 42, | |
| "data": { | |
| "text/plain": "106" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "n_Pred = len(event_pred)\nn_Pred", | |
| "execution_count": 45, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 45, | |
| "data": { | |
| "text/plain": "429" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "n_True = len(event_true)\nn_True", | |
| "execution_count": 47, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 47, | |
| "data": { | |
| "text/plain": "112" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
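| "source": "Consistency check: if both metrics count true positives the same way, these two identities give the same TP.\n\n* FN + TP = n_True\n* TP + FP = n_Pred" | |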
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "TP = n_True - FN\nTP", | |
| "execution_count": 48, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 48, | |
| "data": { | |
| "text/plain": "6" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "TP + len(FP)", | |
| "execution_count": 50, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 50, | |
| "data": { | |
| "text/plain": "45" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "TP = n_Pred - len(FP)\nTP", | |
| "execution_count": 52, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "execution_count": 52, | |
| "data": { | |
| "text/plain": "390" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "metadata": { | |
| "trusted": true | |
| }, | |
| "cell_type": "code", | |
| "source": "", | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ], | |
| "metadata": { | |
| "toc": { | |
| "navigate_menu": true, | |
| "toc_window_display": false, | |
| "sideBar": true, | |
| "toc_cell": false, | |
| "threshold": 6, | |
| "toc_section_display": "block", | |
| "number_sections": true | |
| }, | |
| "language_info": { | |
| "name": "python", | |
| "pygments_lexer": "ipython3", | |
| "mimetype": "text/x-python", | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "version": "3.5.5", | |
| "nbconvert_exporter": "python" | |
| }, | |
| "nav_menu": {}, | |
| "kernelspec": { | |
| "name": "dev", | |
| "display_name": "Python 3 (dev)", | |
| "language": "python" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment