Skip to content

Instantly share code, notes, and snippets.

@byronyi
Last active August 29, 2015 14:17
Show Gist options
  • Select an option

  • Save byronyi/761dc3fe0e5e283e6c20 to your computer and use it in GitHub Desktop.

Select an option

Save byronyi/761dc3fe0e5e283e6c20 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Logistic Regression\n",
"==="
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from matplotlib import pyplot as plt\n",
"\n",
"pd.set_option('display.max_rows', 500)\n",
"pd.set_option('display.max_columns', 500)\n",
"pd.set_option('display.width', 80)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"demo_barrier.csv\n",
"\n",
" precision recall f1-score support\n",
"\n",
" cost 0.57 0.44 0.50 235\n",
" range 0.59 0.42 0.49 241\n",
" safety 0.09 0.62 0.16 8\n",
" charge 0.05 0.20 0.08 10\n",
" location 0.09 0.38 0.14 13\n",
"\n",
"avg / total 0.55 0.43 0.47 507\n",
"\n",
"\t\tPredicted target\n",
" cost range safety charge location\n",
"cost 104 61 28 19 23\n",
"range 71 101 23 20 26\n",
"safety 2 1 5 0 0\n",
"charge 2 4 0 2 2\n",
"location 3 4 0 1 5\n",
"\n",
"\t\tCoefficients\n",
" cost range safety charge location\n",
"intercept 0.15 -1.94 -5.76 -4.67 -3.86\n",
"male -0.05 -0.87 -4.26 -1.54 -1.83\n",
"female 0.20 -1.07 -1.50 -3.13 -2.03\n",
"age_0 5.27 -3.30 -8.46 -2.93 -5.98\n",
"age_2 -1.22 0.31 -0.20 2.37 0.59\n",
"age_3 -1.03 0.08 0.97 1.53 1.13\n",
"age_4 -1.01 0.31 1.13 -8.87 -0.24\n",
"age_5 -1.87 0.67 0.80 3.22 0.64\n",
"high_b 5.85 -3.97 -7.21 -7.64 -0.29\n",
"high -1.36 0.14 1.13 1.12 -1.68\n",
"bachelor -1.87 0.66 -0.97 1.19 -0.73\n",
"master -2.48 1.24 1.29 0.66 -1.16\n",
"0k -0.44 -0.08 0.06 -0.18 -7.31\n",
"30k 0.34 -0.49 -1.80 -1.86 -7.45\n",
"50k -0.32 -0.14 -1.47 -0.63 3.56\n",
"100k -0.34 -0.27 0.43 -0.16 3.67\n",
"200k 0.90 -0.95 -2.98 -1.84 3.67\n",
"pop_1 0.65 -0.35 -4.72 -8.06 -0.82\n",
"pop_2 0.55 -1.07 5.16 1.13 -0.83\n",
"pop_3 0.06 -0.45 5.51 -0.07 -0.98\n",
"pop_4 0.00 -0.38 -7.04 0.38 -0.11\n",
"pop_5 -1.12 0.31 -4.66 1.95 -1.13\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"demo_range.csv\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 250k 0.49 0.49 0.49 353\n",
" 350k 0.46 0.24 0.32 293\n",
" 450k 0.18 0.21 0.19 86\n",
" 550k 0.07 0.30 0.11 20\n",
" 650k 0.19 0.34 0.24 59\n",
"\n",
"avg / total 0.41 0.36 0.37 811\n",
"\n",
"\t\tPredicted target\n",
" 250k 350k 450k 550k 650k\n",
"250k 174 52 43 38 46\n",
"350k 126 71 30 38 28\n",
"450k 34 20 18 4 10\n",
"550k 6 4 3 6 1\n",
"650k 16 9 8 6 20\n",
"\n",
"\t\tCoefficients\n",
" 250k 350k 450k 550k 650k\n",
"intercept -0.13 -1.75 -2.65 -3.10 -1.89\n",
"male -0.10 -0.84 -1.41 -2.02 -0.63\n",
"female -0.04 -0.91 -1.25 -1.09 -1.26\n",
"age_0 1.06 -4.13 -5.07 -6.43 0.56\n",
"age_2 -0.30 0.78 0.34 1.43 -1.04\n",
"age_3 -0.51 0.74 0.73 0.92 -0.41\n",
"age_4 -0.30 0.55 0.92 0.53 -0.79\n",
"age_5 -0.08 0.32 0.42 0.45 -0.20\n",
"high_b 0.49 -0.49 -5.08 -5.57 -4.31\n",
"high -0.05 -0.67 0.73 1.13 1.03\n",
"bachelor -0.13 -0.40 0.89 0.77 0.39\n",
"master -0.44 -0.19 0.82 0.56 1.00\n",
"0k 0.01 -0.38 -0.95 -0.07 -0.03\n",
"30k 0.11 -0.27 -0.74 -0.69 -0.69\n",
"50k 0.21 -0.42 -0.64 -0.80 -0.78\n",
"100k -0.26 -0.39 -0.26 0.05 -0.09\n",
"200k -0.19 -0.27 -0.06 -1.60 -0.29\n",
"pop_1 5.60 -3.98 -4.63 -7.26 -4.46\n",
"pop_2 -1.42 0.46 1.07 0.35 0.40\n",
"pop_3 -1.29 0.50 0.22 0.87 0.77\n",
"pop_4 -1.31 0.61 0.55 0.69 0.28\n",
"pop_5 -1.72 0.67 0.15 2.24 1.12\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"demo_time.csv\n",
"\n",
" precision recall f1-score support\n",
"\n",
" min_30 0.27 0.18 0.22 145\n",
" min_20 0.37 0.43 0.40 297\n",
" min_10 0.26 0.25 0.26 161\n",
" min_5 0.17 0.18 0.18 105\n",
" min_0 0.19 0.20 0.19 103\n",
"\n",
"avg / total 0.28 0.29 0.28 811\n",
"\n",
"\t\tPredicted target\n",
" min_30 min_20 min_10 min_5 min_0\n",
"min_30 26 61 28 17 13\n",
"min_20 32 127 56 36 46\n",
"min_10 9 71 40 20 21\n",
"min_5 17 43 14 19 12\n",
"min_0 11 38 14 19 21\n",
"\n",
"\t\tCoefficients\n",
" min_30 min_20 min_10 min_5 min_0\n",
"intercept -1.04 -1.09 -1.04 -1.77 -1.53\n",
"male -0.66 -0.43 -0.54 -0.84 -0.82\n",
"female -0.38 -0.66 -0.50 -0.92 -0.71\n",
"age_0 0.11 -0.63 1.02 -5.25 -5.50\n",
"age_2 -0.06 -0.03 -0.47 0.95 0.48\n",
"age_3 -0.35 -0.15 -0.20 0.76 0.96\n",
"age_4 -0.40 -0.40 -0.51 1.55 1.04\n",
"age_5 -0.33 0.12 -0.88 0.22 1.48\n",
"high_b -4.59 -0.36 0.45 -5.11 0.24\n",
"high 1.42 -0.32 -0.51 1.12 -0.71\n",
"bachelor 1.07 -0.23 -0.15 1.00 -0.79\n",
"master 1.07 -0.19 -0.83 1.22 -0.28\n",
"0k -0.34 -0.27 -0.29 -0.44 0.18\n",
"30k -0.31 -0.25 -0.00 -0.25 -0.49\n",
"50k -0.18 -0.24 -0.09 -0.37 -0.42\n",
"100k -0.27 -0.21 -0.21 -0.44 -0.14\n",
"200k 0.06 -0.11 -0.45 -0.27 -0.66\n",
"pop_1 1.51 -4.24 -4.17 0.47 -5.51\n",
"pop_2 -0.67 0.85 0.77 -0.53 0.97\n",
"pop_3 -0.42 0.94 0.84 -0.86 0.64\n",
"pop_4 -0.81 0.82 1.08 -0.61 0.84\n",
"pop_5 -0.64 0.54 0.45 -0.25 1.52\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"habit_barrier.csv\n",
"\n",
" precision recall f1-score support\n",
"\n",
" cost 0.63 0.49 0.55 235\n",
" range 0.64 0.42 0.51 241\n",
" safety 0.14 0.88 0.24 8\n",
" charge 0.08 0.50 0.14 10\n",
" location 0.06 0.23 0.09 13\n",
"\n",
"avg / total 0.60 0.46 0.51 507\n",
"\n",
"\t\tPredicted target\n",
" cost range safety charge location\n",
"cost 116 50 20 27 22\n",
"range 63 102 22 27 27\n",
"safety 1 0 7 0 0\n",
"charge 2 3 0 5 0\n",
"location 3 5 2 0 3\n",
"\n",
"\t\tCoefficients\n",
" cost range safety charge location\n",
"intercept 0.96 -1.95 -6.58 -6.14 -5.49\n",
"car_0 0.59 -0.82 -10.74 -1.56 -8.90\n",
"car_1 0.07 -0.50 1.76 -2.48 2.29\n",
"car_2 0.31 -0.63 2.40 -2.09 1.13\n",
"daily 0.27 -0.58 -1.88 0.85 -0.92\n",
"weekly 0.57 -0.63 -2.09 1.45 -2.50\n",
"monthly -0.04 -0.38 -1.69 -9.88 -0.54\n",
"never 0.16 -0.37 -0.91 1.43 -1.54\n",
"leisure -6.02 3.66 -0.27 -6.73 3.12\n",
"commute -5.89 3.12 -2.27 5.32 3.14\n",
"professional -5.56 3.94 -1.01 -4.63 -8.39\n",
"km_0 0.66 -0.54 -2.91 -2.45 5.68\n",
"km_5 -0.02 -0.11 -2.03 0.64 -4.05\n",
"km_10 0.02 -0.23 -14.98 -0.35 -2.94\n",
"km_20 0.59 -1.00 6.11 -1.71 -2.37\n",
"km_30 -0.30 -0.07 7.23 -2.27 -1.81\n",
"min_0 0.02 -0.44 3.04 -10.66 -7.66\n",
"min_15 0.55 -0.88 4.22 -0.03 1.87\n",
"min_30 -0.16 0.21 -6.54 1.66 0.43\n",
"min_60 0.54 -0.84 -7.30 2.88 -0.13\n",
"never.1 0.87 -1.35 0.26 -0.14 1.20\n",
"hv 0.03 -0.60 1.76 -0.01 1.38\n",
"phv 0.00 0.00 0.00 0.00 0.00\n",
"ev 0.06 0.00 -8.60 -5.99 -8.07\n",
"no 0.26 -0.79 0.60 -10.06 1.07\n",
"yr_0 0.43 -0.38 -10.03 0.99 -9.12\n",
"yr_1 0.13 -0.17 0.61 1.04 1.22\n",
"yr_3 0.04 -0.17 0.85 0.64 0.63\n",
"yr_5 0.11 -0.44 1.38 1.25 0.70\n",
"200k 0.65 -1.01 3.27 -1.94 -2.12\n",
"300k 0.56 -0.94 3.05 -1.86 -2.41\n",
"400k -0.25 0.00 -12.91 -2.34 -0.96\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"habit_range.csv\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 250k 0.54 0.50 0.52 353\n",
" 350k 0.51 0.42 0.46 293\n",
" 450k 0.20 0.22 0.21 86\n",
" 550k 0.08 0.30 0.13 20\n",
" 650k 0.18 0.24 0.21 59\n",
"\n",
"avg / total 0.46 0.42 0.43 811\n",
"\n",
"\t\tPredicted target\n",
" 250k 350k 450k 550k 650k\n",
"250k 177 84 34 24 34\n",
"350k 97 124 31 22 19\n",
"450k 30 17 19 13 7\n",
"550k 5 4 3 6 2\n",
"650k 18 14 6 7 14\n",
"\n",
"\t\tCoefficients\n",
" 250k 350k 450k 550k 650k\n",
"intercept -0.68 -0.61 -0.85 -2.61 -1.66\n",
"car_0 -0.35 -0.43 -0.04 -0.25 0.02\n",
"car_1 -0.31 0.03 -0.47 -1.10 -0.74\n",
"car_2 -0.02 -0.21 -0.34 -1.27 -0.94\n",
"daily -0.29 -0.16 0.16 -1.05 -0.15\n",
"weekly 0.11 -0.56 0.01 -0.04 -0.49\n",
"monthly -0.36 -0.03 -0.51 -0.71 -0.02\n",
"never -0.14 0.13 -0.52 -0.82 -1.00\n",
"leisure -0.36 -0.07 -0.26 -1.12 0.01\n",
"commute -0.12 -0.51 -0.10 -1.37 -0.04\n",
"professional -0.20 -0.03 -0.49 -0.12 -1.63\n",
"km_0 -0.61 0.42 -0.81 -1.63 0.52\n",
"km_5 -0.52 0.06 -0.14 -0.71 -0.20\n",
"km_10 0.09 -0.45 -0.19 0.16 -0.23\n",
"km_20 0.09 -0.47 -0.10 0.02 -0.23\n",
"km_30 0.28 -0.18 0.39 -0.45 -1.52\n",
"min_0 0.49 -1.05 0.56 0.26 -0.82\n",
"min_15 -0.10 0.07 -0.26 -1.01 -0.95\n",
"min_30 -0.53 0.39 -0.38 -1.72 -0.34\n",
"min_60 -0.53 -0.02 -0.77 -0.15 0.45\n",
"never.1 0.16 -0.38 -0.39 1.22 2.84\n",
"hv 0.09 -0.30 -0.51 0.60 3.07\n",
"phv -0.31 -0.65 1.05 -6.46 -2.88\n",
"ev -0.62 0.71 -1.00 2.02 -4.68\n",
"no -0.10 -0.34 0.00 1.46 -0.01\n",
"yr_0 -0.02 0.14 -1.14 -8.32 -0.06\n",
"yr_1 -0.31 0.04 -0.04 1.66 -0.49\n",
"yr_3 -0.04 -0.36 0.28 1.32 -0.63\n",
"yr_5 -0.19 -0.10 0.05 1.26 -0.47\n",
"200k 0.20 -0.30 -0.72 -1.24 -0.79\n",
"300k -0.23 -0.13 -0.26 -1.00 -0.67\n",
"400k -0.64 -0.18 0.13 -0.38 -0.20\n",
"green -0.44 0.18 0.09 2.24 -1.04\n",
"cost 0.09 0.10 -0.87 -8.63 -0.83\n",
"style -0.32 -0.89 -0.08 3.78 0.20\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"habit_time.csv\n",
"\n",
" precision recall f1-score support\n",
"\n",
" min_30.1 0.30 0.31 0.30 145\n",
" min_20 0.43 0.45 0.44 297\n",
" min_10 0.34 0.30 0.32 161\n",
" min_5 0.19 0.12 0.15 105\n",
" min_0.1 0.26 0.34 0.29 103\n",
"\n",
"avg / total 0.34 0.34 0.34 811\n",
"\n",
"\t\tPredicted target\n",
" min_30.1 min_20 min_10 min_5 min_0.1\n",
"min_30.1 45 44 21 14 21\n",
"min_20 46 135 47 29 40\n",
"min_10 28 56 48 9 20\n",
"min_5 12 46 15 13 19\n",
"min_0.1 20 34 9 5 35\n",
"\n",
"\t\tCoefficients\n",
" min_30.1 min_20 min_10 min_5 min_0.1\n",
"intercept -0.76 -0.39 -1.75 -0.35 -0.79\n",
"car_0 -0.02 -0.60 -0.52 -0.05 0.10\n",
"car_1 -0.66 0.21 -0.37 0.08 -0.84\n",
"car_2 -0.08 0.00 -0.86 -0.39 -0.06\n",
"daily -0.06 -0.16 -0.45 -0.07 -0.26\n",
"weekly -0.04 -0.07 -0.65 -0.09 -0.14\n",
"monthly -0.36 -0.08 -0.36 -0.13 -0.06\n",
"never -0.30 -0.08 -0.29 -0.06 -0.33\n",
"leisure -0.58 -0.03 -0.58 -0.23 0.07\n",
"commute -0.33 -0.10 -0.25 -0.19 -0.54\n",
"professional 0.15 -0.25 -0.92 0.07 -0.31\n",
"km_0 -0.75 0.72 -1.00 -0.15 0.34\n",
"km_5 -0.03 0.11 -0.35 -0.43 -0.26\n",
"km_10 -0.19 0.10 -0.70 0.12 -0.20\n",
"km_20 0.09 -0.64 -0.13 0.27 -0.26\n",
"km_30 0.12 -0.66 0.43 -0.16 -0.41\n",
"min_0 0.24 -0.64 -0.04 -0.13 -0.45\n",
"min_15 -0.51 -0.18 -0.34 0.18 0.01\n",
"min_30 -0.32 -0.10 -0.05 -0.15 -0.41\n",
"min_60 -0.17 0.54 -1.31 -0.25 0.06\n",
"never.1 1.67 -0.48 1.21 -0.59 1.11\n",
"hv 1.23 -0.38 0.99 -0.71 1.71\n",
"phv -4.48 0.82 -5.20 0.71 -4.10\n",
"ev 0.82 -0.35 1.25 0.23 0.48\n",
"no -0.08 -0.34 -0.42 0.12 0.34\n",
"yr_0 -0.05 -0.02 -0.28 0.25 -1.39\n",
"yr_1 -0.17 0.22 -0.53 -0.35 -0.08\n",
"yr_3 -0.03 -0.12 -0.45 -0.20 0.20\n",
"yr_5 -0.45 -0.12 -0.08 -0.18 0.14\n",
"200k -0.12 -0.11 -0.65 -0.27 -0.20\n",
"300k -0.12 -0.13 -0.52 -0.24 -0.41\n",
"400k -0.53 -0.15 -0.58 0.16 -0.19\n",
"green -0.79 -0.02 2.14 0.03 -0.47\n",
"cost 0.07 -0.17 1.61 -0.01 -0.81\n",
"style -0.05 -0.19 -5.50 -0.37 0.48\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display, Markdown\n",
"\n",
"from sklearn.metrics import confusion_matrix, classification_report\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"dataset = !ls *.csv\n",
"for data in dataset:\n",
" df = pd.read_csv(data)\n",
" feature_names = df.columns[:-5]\n",
" target_names = df.columns[-5:]\n",
" X = np.array(df[feature_names])\n",
" Y = np.array(df[target_names])\n",
" y = np.argwhere(Y == 1)[:, 1]\n",
" \n",
" print data\n",
" print\n",
" model = LogisticRegression(C=1e10, class_weight='auto')\n",
" model.fit(X, y)\n",
" # print model.dual_coef_\n",
" print classification_report(y, model.predict(X), target_names=target_names)\n",
" print '\\t\\tPredicted target'\n",
" print pd.DataFrame(confusion_matrix(y, model.predict(X)), index=target_names, columns=target_names)\n",
" print\n",
" print '\\t\\tCoefficients'\n",
" print pd.DataFrame(np.round(np.c_[model.intercept_, model.coef_].T, 2), \n",
" index=['intercept']+list(feature_names), columns=target_names)\n",
" print\n",
" display(Markdown('---'))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment