Last active
August 29, 2015 13:59
-
-
Save falcondai/10705851 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "baseline" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import pandas as pd", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "%pylab inline", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Populating the interactive namespace from numpy and matplotlib\n" | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "bs = pd.read_csv('datasets/ds0-battery.csv')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# only look at blocks with SV images\nbs = bs[bs.image_frequency>0]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from sklearn import neighbors, cross_validation, metrics", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# 5-fold CV\ncv5 = cross_validation.KFold(len(bs), n_folds=5, shuffle=True)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# locations as input variable\nxs = bs[['west', 'north']].values", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# use battery freq as regression target variable\nys = bs.battery_frequency.values", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "bs.battery_frequency.value_counts()", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 24, | |
"text": "0 163\n1 141\n2 113\n3 55\n4 55\n5 43\n6 37\n8 18\n7 14\n9 13\n10 6\n11 6\n13 6\n12 5\n19 2\n18 2\n14 1\n16 1\n23 1\ndtype: int64" | |
} | |
], | |
"prompt_number": 24 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "Some baseline regressors on non-image features\n----------------------------------------------\n\nthere are two common metrics to use: MAE and RMSE for regression, they shouldn't make much difference here since none of the baseline methods I considered actually optimize with respect to an error function." | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "predict the mean\n----------------" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "metrics.mean_squared_error(ys, [mean(ys)] * len(ys))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 21, | |
"text": "10.5047105718045" | |
} | |
], | |
"prompt_number": 21 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "metrics.mean_absolute_error(ys, [mean(ys)] * len(ys))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 22, | |
"text": "2.4005168514202655" | |
} | |
], | |
"prompt_number": 22 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "predict all 0\n-------------" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "metrics.mean_squared_error(ys, [0] * len(ys))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 26, | |
"text": "18.587976539589445" | |
} | |
], | |
"prompt_number": 26 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "metrics.mean_absolute_error(ys, [0] * len(ys))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 25, | |
"text": "2.8431085043988271" | |
} | |
], | |
"prompt_number": 25 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "nearest neighbors\n-----------------\nthis performs the best and the error is already quite small by using location alone. I am anxious to see how much performance image can boost." | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# nearest neighbors\nss = cross_validation.cross_val_score(neighbors.KNeighborsRegressor(n_neighbors=3), xs, ys, cv=cv5, score_func=metrics.mean_absolute_error)\nprint mean(ss), std(ss), ss", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "1.8163374839 0.0911978257407 [ 1.88807786 1.66909976 1.78921569 1.80147059 1.93382353]\n" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:1137: DeprecationWarning: Passing function as ``score_func`` is deprecated and will be removed in 0.15. Either use strings or score objects.The relevant new parameter is called ''scoring''.\n scoring=scoring\n" | |
} | |
], | |
"prompt_number": 37 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment