Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save jorisvandenbossche/7ff5ad39da2b306e8e3dc4f91b534ea0 to your computer and use it in GitHub Desktop.

Select an option

Save jorisvandenbossche/7ff5ad39da2b306e8e3dc4f91b534ea0 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "%matplotlib inline\n\nimport pandas as pd",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import problem",
"execution_count": 8,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "data_train, y_train = problem.get_train_data()",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "data_test, y_test = problem.get_test_data()",
"execution_count": 10,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import sys\nsys.path.insert(0, './submissions/starting_kit/')",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from feature_extractor import FeatureExtractor",
"execution_count": 13,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "fe = FeatureExtractor()",
"execution_count": 14,
"outputs": []
},
{
"metadata": {
"scrolled": false,
"trusted": true
},
"cell_type": "code",
"source": "X_train_t = fe.transform(data_train)\nX_test_t = fe.transform(data_test)",
"execution_count": 16,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import classification_report, confusion_matrix",
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "model = RandomForestClassifier(n_jobs=-1)",
"execution_count": 18,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "model.fit(X_train_t, y_train, n_estimators=50)",
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"text": "/home/joris/scipy/scikit-learn/sklearn/ensemble/forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n",
"name": "stderr"
},
{
"output_type": "execute_result",
"execution_count": 19,
"data": {
"text/plain": "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n max_depth=None, max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,\n oob_score=False, random_state=None, verbose=0,\n warm_start=False)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "model.score(X_test_t, y_test)",
"execution_count": 23,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 23,
"data": {
"text/plain": "0.94548435113389828"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "y_pred = model.predict(X_test_t)",
"execution_count": 20,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print(classification_report(y_test, y_pred))",
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"text": " precision recall f1-score support\n\n 0 0.95 1.00 0.97 191755\n 1 0.79 0.25 0.39 13819\n\n micro avg 0.95 0.95 0.95 205574\n macro avg 0.87 0.62 0.68 205574\nweighted avg 0.94 0.95 0.93 205574\n\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "y_proba = model.predict_proba(X_test_t)",
"execution_count": 24,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": ""
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from problem import turnPredictionToEventList",
"execution_count": 25,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "y = pd.Series(y_proba[:, 1], index=data_test.index)",
"execution_count": 27,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "event_pred = turnPredictionToEventList(y)",
"execution_count": 28,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "event_true = turnPredictionToEventList(y_test)",
"execution_count": 29,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from problem import overlapWithList, find\nimport datetime\n\ndef precision(event_true, event_pred):\n FP = [x for x in event_pred\n if max(overlapWithList(x, event_true, percent=True)) < 0.4]\n FP_too_short = [x for x in FP\n if x.duration < datetime.timedelta(hours=2.5)]\n for event in FP_too_short:\n FP.remove(event)\n score = 1-len(FP)/len(event_pred)\n return score\n\ndef recall(event_true, event_pred):\n FN = 0\n for event in event_true:\n corresponding = find(event, event_pred, 0.5, 'best')\n if corresponding is None:\n FN += 1\n score = 1-FN/len(event_true)\n return score\n",
"execution_count": 33,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "precision(event_true, event_pred)",
"execution_count": 34,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 34,
"data": {
"text/plain": "0.9090909090909091"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "recall(event_true, event_pred)",
"execution_count": 35,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 35,
"data": {
"text/plain": "0.0535714285714286"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from problem import overlapWithList, find\nimport datetime\n\ndef precision2(event_true, event_pred):\n FP = [x for x in event_pred\n if max(overlapWithList(x, event_true, percent=True)) < 0.4]\n FP_too_short = [x for x in FP\n if x.duration < datetime.timedelta(hours=2.5)]\n for event in FP_too_short:\n FP.remove(event)\n return FP\n # score = 1-len(FP)/len(event_pred)\n # return score\n\ndef recall2(event_true, event_pred):\n FN = 0\n for event in event_true:\n corresponding = find(event, event_pred, 0.5, 'best')\n if corresponding is None:\n FN += 1\n return FN\n #score = 1-FN/len(event_true)\n #return score\n",
"execution_count": 37,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "FP = precision2(event_true, event_pred)",
"execution_count": 39,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "FN = recall2(event_true, event_pred)",
"execution_count": 38,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "len(FP)",
"execution_count": 40,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 40,
"data": {
"text/plain": "39"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "FN",
"execution_count": 42,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 42,
"data": {
"text/plain": "106"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "n_Pred = len(event_pred)\nn_Pred",
"execution_count": 45,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 45,
"data": {
"text/plain": "429"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "n_True = len(event_true)\nn_True",
"execution_count": 47,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 47,
"data": {
"text/plain": "112"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "* FN + TP = n_True \n* TP + FP = n_Pred"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "TP = n_True - FN\nTP",
"execution_count": 48,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 48,
"data": {
"text/plain": "6"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "TP + len(FP)",
"execution_count": 50,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 50,
"data": {
"text/plain": "45"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "TP = n_Pred - len(FP)\nTP",
"execution_count": 52,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 52,
"data": {
"text/plain": "390"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"toc": {
"navigate_menu": true,
"toc_window_display": false,
"sideBar": true,
"toc_cell": false,
"threshold": 6,
"toc_section_display": "block",
"number_sections": true
},
"language_info": {
"name": "python",
"pygments_lexer": "ipython3",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"version": "3.5.5",
"nbconvert_exporter": "python"
},
"nav_menu": {},
"kernelspec": {
"name": "dev",
"display_name": "Python 3 (dev)",
"language": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment