falcondai · August 29, 2015 13:59
diff --git a/baseline.ipynb b/baseline.ipynb
 {
 "metadata": {
  "name": "baseline"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "import pandas as pd",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "%pylab inline",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "Populating the interactive namespace from numpy and matplotlib\n"
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "bs = pd.read_csv('datasets/ds0-battery.csv')",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "# only look at blocks with SV images\nbs = bs[bs.image_frequency>0]",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "from sklearn import neighbors, cross_validation, metrics",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "# 5-fold CV\ncv5 = cross_validation.KFold(len(bs), n_folds=5, shuffle=True)",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "# locations as input variable\nxs = bs[['west', 'north']].values",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "# use battery freq as regression target variable\nys = bs.battery_frequency.values",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "bs.battery_frequency.value_counts()",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 24,
       "text": "0     163\n1     141\n2     113\n3      55\n4      55\n5      43\n6      37\n8      18\n7      14\n9      13\n10      6\n11      6\n13      6\n12      5\n19      2\n18      2\n14      1\n16      1\n23      1\ndtype: int64"
      }
     ],
     "prompt_number": 24
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Some baseline regressors on non-image features\n----------------------------------------------\n\nThere are two common metrics for regression: MAE and RMSE. The choice shouldn't make much difference here, since none of the baseline methods I considered actually optimizes with respect to an error function. (Note that the cells below report MSE, i.e. the square of RMSE, alongside MAE.)"
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "predict the mean\n----------------"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "metrics.mean_squared_error(ys, [mean(ys)] * len(ys))",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 21,
       "text": "10.5047105718045"
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "metrics.mean_absolute_error(ys, [mean(ys)] * len(ys))",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 22,
       "text": "2.4005168514202655"
      }
     ],
     "prompt_number": 22
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "predict all 0\n-------------"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "metrics.mean_squared_error(ys, [0] * len(ys))",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 26,
       "text": "18.587976539589445"
      }
     ],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "metrics.mean_absolute_error(ys, [0] * len(ys))",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 25,
       "text": "2.8431085043988271"
      }
     ],
     "prompt_number": 25
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "nearest neighbors\n-----------------\nThis performs the best, and the error is already quite small using location alone. I am eager to see how much the image features can boost performance."
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "# nearest neighbors\nss = cross_validation.cross_val_score(neighbors.KNeighborsRegressor(n_neighbors=3), xs, ys, cv=cv5, score_func=metrics.mean_absolute_error)\nprint mean(ss), std(ss), ss",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "1.8163374839 0.0911978257407 [ 1.88807786  1.66909976  1.78921569  1.80147059  1.93382353]\n"
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:1137: DeprecationWarning: Passing function as ``score_func`` is deprecated and will be removed in 0.15. Either use strings or score objects.The relevant new parameter is called ''scoring''.\n  scoring=scoring\n"
      }
     ],
     "prompt_number": 37
    }
   ],
   "metadata": {}
  }
 ]
 }
	{
	"metadata": {
	"name": "baseline"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "import pandas as pd",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "%pylab inline",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "Populating the interactive namespace from numpy and matplotlib\n"
	}
	],
	"prompt_number": 18
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "bs = pd.read_csv('datasets/ds0-battery.csv')",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "# only look at blocks with SV images\nbs = bs[bs.image_frequency>0]",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 6
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "from sklearn import neighbors, cross_validation, metrics",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 8
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "# 5-fold CV\ncv5 = cross_validation.KFold(len(bs), n_folds=5, shuffle=True)",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 9
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "# locations as input variable\nxs = bs[['west', 'north']].values",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 13
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "# use battery freq as regression target variable\nys = bs.battery_frequency.values",
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 12
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "bs.battery_frequency.value_counts()",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 24,
	"text": "0 163\n1 141\n2 113\n3 55\n4 55\n5 43\n6 37\n8 18\n7 14\n9 13\n10 6\n11 6\n13 6\n12 5\n19 2\n18 2\n14 1\n16 1\n23 1\ndtype: int64"
	}
	],
	"prompt_number": 24
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "Some baseline regressors on non-image features\n----------------------------------------------\n\nThere are two common metrics for regression: MAE and RMSE. The choice shouldn't make much difference here, since none of the baseline methods I considered actually optimizes with respect to an error function. (Note that the cells below report MSE, i.e. the square of RMSE, alongside MAE.)"
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "predict the mean\n----------------"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "metrics.mean_squared_error(ys, [mean(ys)] * len(ys))",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 21,
	"text": "10.5047105718045"
	}
	],
	"prompt_number": 21
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "metrics.mean_absolute_error(ys, [mean(ys)] * len(ys))",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 22,
	"text": "2.4005168514202655"
	}
	],
	"prompt_number": 22
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "predict all 0\n-------------"
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "metrics.mean_squared_error(ys, [0] * len(ys))",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 26,
	"text": "18.587976539589445"
	}
	],
	"prompt_number": 26
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "metrics.mean_absolute_error(ys, [0] * len(ys))",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 25,
	"text": "2.8431085043988271"
	}
	],
	"prompt_number": 25
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": "nearest neighbors\n-----------------\nThis performs the best, and the error is already quite small using location alone. I am eager to see how much the image features can boost performance."
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": "# nearest neighbors\nss = cross_validation.cross_val_score(neighbors.KNeighborsRegressor(n_neighbors=3), xs, ys, cv=cv5, score_func=metrics.mean_absolute_error)\nprint mean(ss), std(ss), ss",
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": "1.8163374839 0.0911978257407 [ 1.88807786 1.66909976 1.78921569 1.80147059 1.93382353]\n"
	},
	{
	"output_type": "stream",
	"stream": "stderr",
	"text": "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:1137: DeprecationWarning: Passing function as ``score_func`` is deprecated and will be removed in 0.15. Either use strings or score objects.The relevant new parameter is called ''scoring''.\n scoring=scoring\n"
	}
	],
	"prompt_number": 37
	}
	],
	"metadata": {}
	}
	]
	}