akelleh · March 1, 2018 17:57
diff --git a/instrumental_variables_demo.ipynb b/instrumental_variables_demo.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "from statsmodels.sandbox.regression.gmm import IV2SLS \n",
    "from statsmodels.api import OLS, Logit\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = 5000\n",
    "\n",
    "u = np.random.normal(size=N)\n",
    "\n",
    "z = np.random.normal(size=N)\n",
    "\n",
    "p_d = 1. / (1. + np.exp(-(u+z)))\n",
    "d = np.random.binomial(1, p=p_d)\n",
    "\n",
    "y0 = np.random.normal(size=N)\n",
    "y1 = np.random.normal(u)\n",
    "y = (d==1)* y1 + (d==0) * y0\n",
    "X = pd.DataFrame({'d': d, 'y': y, 'z': z, 'y0': y0, 'y1': y1})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['intercept'] = 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.34807908945587035"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X[X['d'] == 1]['y'].mean() - X[X['d'] == 0]['y'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.007102631189169498"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(X['y1'] - X['y0']).mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Let's make sure our instrument is reasonably strong..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>z</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>z</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.253473</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>0.253473</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          z         d\n",
       "z  1.000000  0.253473\n",
       "d  0.253473  1.000000"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X[['z', 'd']].corr() "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### First, let's do it in two stages manually, so we can see the process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.614431\n",
      "         Iterations 5\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>            <td>y</td>        <th>  R-squared:         </th> <td>   0.000</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>  -0.000</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>  0.4417</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Thu, 01 Mar 2018</td> <th>  Prob (F-statistic):</th>  <td> 0.506</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>12:40:30</td>     <th>  Log-Likelihood:    </th> <td> -8034.4</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>  5000</td>      <th>  AIC:               </th> <td>1.607e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>  4998</td>      <th>  BIC:               </th> <td>1.609e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     1</td>      <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "       <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>d_expected</th> <td>    0.0591</td> <td>    0.089</td> <td>    0.665</td> <td> 0.506</td> <td>   -0.115</td> <td>    0.233</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>intercept</th>  <td>    0.1666</td> <td>    0.048</td> <td>    3.493</td> <td> 0.000</td> <td>    0.073</td> <td>    0.260</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>14.783</td> <th>  Durbin-Watson:     </th> <td>   2.046</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.001</td> <th>  Jarque-Bera (JB):  </th> <td>  15.132</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 0.115</td> <th>  Prob(JB):          </th> <td>0.000518</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 3.142</td> <th>  Cond. No.          </th> <td>    6.56</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                      y   R-squared:                       0.000\n",
       "Model:                            OLS   Adj. R-squared:                 -0.000\n",
       "Method:                 Least Squares   F-statistic:                    0.4417\n",
       "Date:                Thu, 01 Mar 2018   Prob (F-statistic):              0.506\n",
       "Time:                        12:40:30   Log-Likelihood:                -8034.4\n",
       "No. Observations:                5000   AIC:                         1.607e+04\n",
       "Df Residuals:                    4998   BIC:                         1.609e+04\n",
       "Df Model:                           1                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "d_expected     0.0591      0.089      0.665      0.506      -0.115       0.233\n",
       "intercept      0.1666      0.048      3.493      0.000       0.073       0.260\n",
       "==============================================================================\n",
       "Omnibus:                       14.783   Durbin-Watson:                   2.046\n",
       "Prob(Omnibus):                  0.001   Jarque-Bera (JB):               15.132\n",
       "Skew:                           0.115   Prob(JB):                     0.000518\n",
       "Kurtosis:                       3.142   Cond. No.                         6.56\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "\"\"\""
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "instrument_model = Logit(X['d'], X[['z', 'intercept']])\n",
    "instrument_result = instrument_model.fit()\n",
    "\n",
    "X['d_expected'] = instrument_result.predict(X[['z', 'intercept']])\n",
    "causal_model = OLS(X['y'], X[['d_expected', 'intercept']])\n",
    "result = causal_model.fit()\n",
    "result.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Now, using statsmodel's implementation for 2sls. \n",
    "Note: their result.summary method is broken!!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>-0.084862</td>\n",
       "      <td>0.08305</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>intercept</th>\n",
       "      <td>0.134165</td>\n",
       "      <td>0.22065</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  0        1\n",
       "d         -0.084862  0.08305\n",
       "intercept  0.134165  0.22065"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = IV2SLS(X['y'], X[['d', 'intercept']], instrument=X[['z', 'intercept']])\n",
    "result = model.fit()\n",
    "result.conf_int()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### This all works even if Z doesn't cause D! As long as they're associated (and D doesn't cause Z) then you're okay!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = 5000\n",
    "\n",
    "u = np.random.normal(size=N)\n",
    "\n",
    "uz = np.random.normal(size=N)\n",
    "\n",
    "z = np.random.normal(uz, size=N)\n",
    "\n",
    "p_d = 1. / (1. + np.exp(-(u+uz)))\n",
    "d = np.random.binomial(1, p=p_d)\n",
    "\n",
    "y0 = np.random.normal(size=N)\n",
    "y1 = np.random.normal(u)\n",
    "y = (d==1)* y1 + (d==0) * y0\n",
    "X = pd.DataFrame({'d': d, 'y': y, 'z': z, 'y0': y0, 'y1': y1})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Let's make sure the instrument is reasonably strong!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>z</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>z</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.260994</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>0.260994</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          z         d\n",
       "z  1.000000  0.260994\n",
       "d  0.260994  1.000000"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X[['z', 'd']].corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['intercept'] = 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>-0.294580</td>\n",
       "      <td>0.209399</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>intercept</th>\n",
       "      <td>0.060953</td>\n",
       "      <td>0.318652</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  0         1\n",
       "d         -0.294580  0.209399\n",
       "intercept  0.060953  0.318652"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = IV2SLS(X['y'], X[['d', 'intercept']], instrument=X[['z', 'intercept']])\n",
    "result = model.fit()\n",
    "result.conf_int()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Looks good!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### What if the assumption that Z only causes Y through D is violated?\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = 5000\n",
    "\n",
    "u = np.random.normal(size=N)\n",
    "\n",
    "z = np.random.normal(size=N)\n",
    "\n",
    "a = z + np.random.normal(size=N)\n",
    "\n",
    "p_d = 1. / (1. + np.exp(-(u+z)))\n",
    "d = np.random.binomial(1, p=p_d)\n",
    "\n",
    "y0 = np.random.normal(size=N)\n",
    "y1 = a + np.random.normal(u)\n",
    "y = (d==1)* y1 + (d==0) * y0\n",
    "X = pd.DataFrame({'d': d, 'y': y, 'z': z, 'a' :a, 'y0': y0, 'y1': y1})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>d</th>\n",
       "      <th>y</th>\n",
       "      <th>y0</th>\n",
       "      <th>y1</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.255894</td>\n",
       "      <td>0.474881</td>\n",
       "      <td>-0.021884</td>\n",
       "      <td>0.711883</td>\n",
       "      <td>0.704982</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>0.255894</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.245038</td>\n",
       "      <td>-0.014144</td>\n",
       "      <td>0.373323</td>\n",
       "      <td>0.363579</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y</th>\n",
       "      <td>0.474881</td>\n",
       "      <td>0.245038</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.308260</td>\n",
       "      <td>0.656390</td>\n",
       "      <td>0.342872</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y0</th>\n",
       "      <td>-0.021884</td>\n",
       "      <td>-0.014144</td>\n",
       "      <td>0.308260</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.011003</td>\n",
       "      <td>-0.031003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y1</th>\n",
       "      <td>0.711883</td>\n",
       "      <td>0.373323</td>\n",
       "      <td>0.656390</td>\n",
       "      <td>-0.011003</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.514429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>z</th>\n",
       "      <td>0.704982</td>\n",
       "      <td>0.363579</td>\n",
       "      <td>0.342872</td>\n",
       "      <td>-0.031003</td>\n",
       "      <td>0.514429</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           a         d         y        y0        y1         z\n",
       "a   1.000000  0.255894  0.474881 -0.021884  0.711883  0.704982\n",
       "d   0.255894  1.000000  0.245038 -0.014144  0.373323  0.363579\n",
       "y   0.474881  0.245038  1.000000  0.308260  0.656390  0.342872\n",
       "y0 -0.021884 -0.014144  0.308260  1.000000 -0.011003 -0.031003\n",
       "y1  0.711883  0.373323  0.656390 -0.011003  1.000000  0.514429\n",
       "z   0.704982  0.363579  0.342872 -0.031003  0.514429  1.000000"
      ]
     },
     "execution_count": 146,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "X['intercept'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>2.618198</td>\n",
       "      <td>3.178242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>intercept</th>\n",
       "      <td>-1.168116</td>\n",
       "      <td>-0.875108</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  0         1\n",
       "d          2.618198  3.178242\n",
       "intercept -1.168116 -0.875108"
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = IV2SLS(X['y'], X[['d', 'intercept']], instrument=X[['z', 'intercept']])\n",
    "result = model.fit()\n",
    "result.conf_int()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Lot's of bias!!!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can fix it!!!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>-0.152977</td>\n",
       "      <td>0.424428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>intercept</th>\n",
       "      <td>0.178235</td>\n",
       "      <td>0.470210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>0.470047</td>\n",
       "      <td>0.544819</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  0         1\n",
       "d         -0.152977  0.424428\n",
       "intercept  0.178235  0.470210\n",
       "a          0.470047  0.544819"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# here, we have to include 'a' in the instrument definition as well, \n",
    "# even though it's a control variable for the y regression\n",
    "model = IV2SLS(X['y'], X[['d', 'intercept', 'a']], \n",
    "               instrument=X[['z', 'intercept', 'a']])\n",
    "result = model.fit()\n",
    "result.conf_int()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.622092\n",
      "         Iterations 5\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>            <td>y</td>        <th>  R-squared:         </th> <td>   0.226</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.225</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   727.9</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Thu, 01 Mar 2018</td> <th>  Prob (F-statistic):</th> <td>3.63e-278</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>12:56:02</td>     <th>  Log-Likelihood:    </th> <td> -8602.5</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>  5000</td>      <th>  AIC:               </th> <td>1.721e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>  4997</td>      <th>  BIC:               </th> <td>1.723e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     2</td>      <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "       <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>d_expected</th> <td>    0.1193</td> <td>    0.147</td> <td>    0.812</td> <td> 0.417</td> <td>   -0.169</td> <td>    0.407</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>a</th>          <td>    0.5089</td> <td>    0.019</td> <td>   26.639</td> <td> 0.000</td> <td>    0.471</td> <td>    0.546</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>intercept</th>  <td>    0.3323</td> <td>    0.074</td> <td>    4.473</td> <td> 0.000</td> <td>    0.187</td> <td>    0.478</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>12.644</td> <th>  Durbin-Watson:     </th> <td>   2.056</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.002</td> <th>  Jarque-Bera (JB):  </th> <td>  12.644</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 0.121</td> <th>  Prob(JB):          </th> <td> 0.00180</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 3.045</td> <th>  Cond. No.          </th> <td>    12.1</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                      y   R-squared:                       0.226\n",
       "Model:                            OLS   Adj. R-squared:                  0.225\n",
       "Method:                 Least Squares   F-statistic:                     727.9\n",
       "Date:                Thu, 01 Mar 2018   Prob (F-statistic):          3.63e-278\n",
       "Time:                        12:56:02   Log-Likelihood:                -8602.5\n",
       "No. Observations:                5000   AIC:                         1.721e+04\n",
       "Df Residuals:                    4997   BIC:                         1.723e+04\n",
       "Df Model:                           2                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "d_expected     0.1193      0.147      0.812      0.417      -0.169       0.407\n",
       "a              0.5089      0.019     26.639      0.000       0.471       0.546\n",
       "intercept      0.3323      0.074      4.473      0.000       0.187       0.478\n",
       "==============================================================================\n",
       "Omnibus:                       12.644   Durbin-Watson:                   2.056\n",
       "Prob(Omnibus):                  0.002   Jarque-Bera (JB):               12.644\n",
       "Skew:                           0.121   Prob(JB):                      0.00180\n",
       "Kurtosis:                       3.045   Cond. No.                         12.1\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "\"\"\""
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "instrument_model = Logit(X['d'], X[['z', 'intercept']])\n",
    "instrument_result = instrument_model.fit()\n",
    "\n",
    "X['d_expected'] = instrument_result.predict(X[['z', 'intercept']])\n",
    "causal_model = OLS(X['y'], X[['d_expected', 'a', 'intercept']])\n",
    "result = causal_model.fit()\n",
    "result.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }