jason-curtis · August 29, 2015 13:58
diff --git a/Statsmodels Issue 987 - formula support in wls_prediction_std.ipynb b/Statsmodels Issue 987 - formula support in wls_prediction_std.ipynb
 {
 "metadata": {
  "name": "",
  "signature": "sha256:d95c278a4f5c235b14b85ab9c6888ff0e0b56e3637c67e326a84f15ed991a89e"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Lack of formula support in wls_prediction_std"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "https://github.com/statsmodels/statsmodels/issues/987\n",
      "\n",
      "Is it possible to generate exog such that wls_prediction_std will accept it? Or is this just a bug/missing feature?"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from scipy import stats\n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "import statsmodels.formula.api as sm\n",
      "\n",
      "# The important one\n",
      "from statsmodels.sandbox.regression.predstd import wls_prediction_std\n",
      "\n",
      "d = pd.DataFrame({'x': np.random.randn(100) })\n",
      "d['y'] = d.x + np.random.randn(100)\n",
      "f = sm.ols('y~x', d).fit()\n",
      "\n",
      "new_exog=pd.DataFrame({'x':[0.5]})"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 42
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# This works, which makes me think that this format for new_exog should be sufficient\n",
      "print f.predict(new_exog)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[ 0.36164212]\n"
       ]
      }
     ],
     "prompt_number": 43
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# ...But this doesn't work\n",
      "print wls_prediction_std(f, new_exog)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "ename": "ValueError",
       "evalue": "wrong shape of exog",
       "output_type": "pyerr",
       "traceback": [
        "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
        "\u001b[1;32m<ipython-input-44-723c5af991d8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# ...But this doesn't work\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[0mwls_prediction_std\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnew_exog\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
        "\u001b[1;32m/usr/local/lib/python2.7/dist-packages/statsmodels/sandbox/regression/predstd.pyc\u001b[0m in \u001b[0;36mwls_prediction_std\u001b[1;34m(res, exog, weights, alpha)\u001b[0m\n\u001b[0;32m     79\u001b[0m         \u001b[0mexog\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0matleast_2d\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mexog\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     80\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mcovb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mexog\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 81\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'wrong shape of exog'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     82\u001b[0m         \u001b[0mpredicted\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mres\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mres\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexog\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     83\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
        "\u001b[1;31mValueError\u001b[0m: wrong shape of exog"
       ]
      }
     ],
     "prompt_number": 44
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Workaround 1\n",
      "=====\n",
      "\n",
      "Steal the exog transformation wrapper from model.predict()"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def transform_exog_to_model(fit, exog):\n",
      "    transform=True\n",
      "    self=fit\n",
      "    \n",
      "    # The following is lifted straight from statsmodels.base.model.Results.predict()\n",
      "    if transform and hasattr(self.model, 'formula') and exog is not None:\n",
      "        from patsy import dmatrix\n",
      "        exog = dmatrix(self.model.data.orig_exog.design_info.builder,\n",
      "                       exog)\n",
      "\n",
      "    if exog is not None:\n",
      "        exog = np.asarray(exog)\n",
      "        if exog.ndim == 1 and (self.model.exog.ndim == 1 or\n",
      "                               self.model.exog.shape[1] == 1):\n",
      "            exog = exog[:, None]\n",
      "        exog = np.atleast_2d(exog)  # needed in count model shape[1]\n",
      "    \n",
      "    # end lifted code\n",
      "    return exog"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 45
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print new_exog"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "     x\n",
        "0  0.5\n",
        "\n",
        "[1 rows x 1 columns]\n"
       ]
      }
     ],
     "prompt_number": 46
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "transformed_exog = transform_exog_to_model(f, new_exog)\n",
      "print transformed_exog"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[[ 1.   0.5]]\n"
       ]
      }
     ],
     "prompt_number": 47
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print wls_prediction_std(f, transformed_exog, weights=[1])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(array([ 0.99865096]), array([-1.62014821]), array([ 2.34343245]))\n"
       ]
      }
     ],
     "prompt_number": 49
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Better fix\n",
      "====\n",
      "\n",
      "wls_prediction_std should actually use Result.predict"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def wls_prediction_std(res, exog=None, weights=None, alpha=0.05):\n",
      "    covb = res.cov_params()\n",
      "    if exog is None:\n",
      "        exog = res.model.exog\n",
      "        predicted = res.fittedvalues\n",
      "        if weights is None:\n",
      "            weights = res.model.weights\n",
      "    else:\n",
      "        # Using res.predict() is less hacky than using res.model.predict()\n",
      "        predicted = res.predict(exog)\n",
      "\n",
      "        if covb.shape[1] != exog.shape[1]:\n",
      "            # raise ValueError('wrong shape of exog') # Nope\n",
      "            # Instead of panicking/raising ValueError in this case, transform \n",
      "            # the exog appropriately so that the interval calculations below can\n",
      "            # function\n",
      "            exog = transform_exog_to_model(res, exog)\n",
      "\n",
      "        if weights is None:\n",
      "            weights = 1.\n",
      "        else:\n",
      "            weights = np.asarray(weights)\n",
      "            if len(weights) > 1 and len(weights) != exog.shape[0]:\n",
      "                raise ValueError('weights and exog do not have matching shape')\n",
      "\n",
      "    # The rest of this is unaltered\n",
      "\n",
      "    if weights is None:\n",
      "        weights = res.model.weights\n",
      "\n",
      "\n",
      "    # full covariance:\n",
      "    #predvar = res3.mse_resid + np.diag(np.dot(X2,np.dot(covb,X2.T)))\n",
      "    # predication variance only\n",
      "    predvar = res.mse_resid/weights + (exog * np.dot(covb, exog.T).T).sum(1)\n",
      "    predstd = np.sqrt(predvar)\n",
      "    tppf = stats.t.isf(alpha/2., res.df_resid)\n",
      "    interval_u = predicted + tppf * predstd\n",
      "    interval_l = predicted - tppf * predstd\n",
      "    return predstd, interval_l, interval_u"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 55
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# And now, does this work?\n",
      "print wls_prediction_std(f, new_exog)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(array([ 0.99865096]), array([-1.62014821]), array([ 2.34343245]))\n"
       ]
      }
     ],
     "prompt_number": 58
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
 }
	{
	"metadata": {
	"name": "",
	"signature": "sha256:d95c278a4f5c235b14b85ab9c6888ff0e0b56e3637c67e326a84f15ed991a89e"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"Lack of formula support in wls_prediction_std"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"https://github.com/statsmodels/statsmodels/issues/987\n",
	"\n",
	"Is it possible to generate exog such that wls_prediction_std will accept it? Or is this just a bug/missing feature?"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from scipy import stats\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"import statsmodels.formula.api as sm\n",
	"\n",
	"# The important one\n",
	"from statsmodels.sandbox.regression.predstd import wls_prediction_std\n",
	"\n",
	"d = pd.DataFrame({'x': np.random.randn(100) })\n",
	"d['y'] = d.x + np.random.randn(100)\n",
	"f = sm.ols('y~x', d).fit()\n",
	"\n",
	"new_exog=pd.DataFrame({'x':[0.5]})"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 42
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# This works, which makes me think that this format for new_exog should be sufficient\n",
	"print f.predict(new_exog)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[ 0.36164212]\n"
	]
	}
	],
	"prompt_number": 43
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# ...But this doesn't work\n",
	"print wls_prediction_std(f, new_exog)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"ename": "ValueError",
	"evalue": "wrong shape of exog",
	"output_type": "pyerr",
	"traceback": [
	"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
	"\u001b[1;32m<ipython-input-44-723c5af991d8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# ...But this doesn't work\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[0mwls_prediction_std\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnew_exog\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
	"\u001b[1;32m/usr/local/lib/python2.7/dist-packages/statsmodels/sandbox/regression/predstd.pyc\u001b[0m in \u001b[0;36mwls_prediction_std\u001b[1;34m(res, exog, weights, alpha)\u001b[0m\n\u001b[0;32m 79\u001b[0m \u001b[0mexog\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0matleast_2d\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mexog\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcovb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mexog\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 81\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'wrong shape of exog'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 82\u001b[0m \u001b[0mpredicted\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mres\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mres\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexog\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 83\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
	"\u001b[1;31mValueError\u001b[0m: wrong shape of exog"
	]
	}
	],
	"prompt_number": 44
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Workaround 1\n",
	"=====\n",
	"\n",
	"Steal the exog transformation wrapper from model.predict()"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"def transform_exog_to_model(fit, exog):\n",
	" transform=True\n",
	" self=fit\n",
	" \n",
	" # The following is lifted straight from statsmodels.base.model.Results.predict()\n",
	" if transform and hasattr(self.model, 'formula') and exog is not None:\n",
	" from patsy import dmatrix\n",
	" exog = dmatrix(self.model.data.orig_exog.design_info.builder,\n",
	" exog)\n",
	"\n",
	" if exog is not None:\n",
	" exog = np.asarray(exog)\n",
	" if exog.ndim == 1 and (self.model.exog.ndim == 1 or\n",
	" self.model.exog.shape[1] == 1):\n",
	" exog = exog[:, None]\n",
	" exog = np.atleast_2d(exog) # needed in count model shape[1]\n",
	" \n",
	" # end lifted code\n",
	" return exog"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 45
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"print new_exog"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" x\n",
	"0 0.5\n",
	"\n",
	"[1 rows x 1 columns]\n"
	]
	}
	],
	"prompt_number": 46
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"transformed_exog = transform_exog_to_model(f, new_exog)\n",
	"print transformed_exog"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[[ 1. 0.5]]\n"
	]
	}
	],
	"prompt_number": 47
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"print wls_prediction_std(f, transformed_exog, weights=[1])"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"(array([ 0.99865096]), array([-1.62014821]), array([ 2.34343245]))\n"
	]
	}
	],
	"prompt_number": 49
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Better fix\n",
	"====\n",
	"\n",
	"wls_prediction_std should actually use Result.predict"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"def wls_prediction_std(res, exog=None, weights=None, alpha=0.05):\n",
	" covb = res.cov_params()\n",
	" if exog is None:\n",
	" exog = res.model.exog\n",
	" predicted = res.fittedvalues\n",
	" if weights is None:\n",
	" weights = res.model.weights\n",
	" else:\n",
	" # Using res.predict() is less hacky than using res.model.predict()\n",
	" predicted = res.predict(exog)\n",
	"\n",
	" if covb.shape[1] != exog.shape[1]:\n",
	" # raise ValueError('wrong shape of exog') # Nope\n",
	" # Instead of panicking/raising ValueError in this case, transform \n",
	" # the exog appropriately so that the interval calculations below can\n",
	" # function\n",
	" exog = transform_exog_to_model(res, exog)\n",
	"\n",
	" if weights is None:\n",
	" weights = 1.\n",
	" else:\n",
	" weights = np.asarray(weights)\n",
	" if len(weights) > 1 and len(weights) != exog.shape[0]:\n",
	" raise ValueError('weights and exog do not have matching shape')\n",
	"\n",
	" # The rest of this is unaltered\n",
	"\n",
	" if weights is None:\n",
	" weights = res.model.weights\n",
	"\n",
	"\n",
	" # full covariance:\n",
	" #predvar = res3.mse_resid + np.diag(np.dot(X2,np.dot(covb,X2.T)))\n",
	" # predication variance only\n",
	" predvar = res.mse_resid/weights + (exog * np.dot(covb, exog.T).T).sum(1)\n",
	" predstd = np.sqrt(predvar)\n",
	" tppf = stats.t.isf(alpha/2., res.df_resid)\n",
	" interval_u = predicted + tppf * predstd\n",
	" interval_l = predicted - tppf * predstd\n",
	" return predstd, interval_l, interval_u"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 55
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# And now, does this work?\n",
	"print wls_prediction_std(f, new_exog)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"(array([ 0.99865096]), array([-1.62014821]), array([ 2.34343245]))\n"
	]
	}
	],
	"prompt_number": 58
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [],
	"language": "python",
	"metadata": {},
	"outputs": []
	}
	],
	"metadata": {}
	}
	]
	}