Skip to content

Instantly share code, notes, and snippets.

@dcalacci
Last active August 29, 2015 14:04
Show Gist options
  • Save dcalacci/4fe68425a5a16c59810c to your computer and use it in GitHub Desktop.
Save dcalacci/4fe68425a5a16c59810c to your computer and use it in GitHub Desktop.
time series correlation of sentiment and statement counts
{
"metadata": {
"name": "",
"signature": "sha256:ba1951a9424c0d8948a247be8ff6ed159e05dc19bdaaf59eaacb0f7e7f66ca45"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import pickle"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 60
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# add project-specific src directories\n",
"import sys\n",
"sys.path.insert(2, '/home/dcalacci/lazerlab/congressional-public-speech/dan/src')\n",
"# project imports\n",
"from analysis import authors\n",
"import analysis\n",
"import datacleaning"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"all_lambda = np.loadtxt('../../../data/models/combined/lambda-2270.dat')\n",
"vocab = datacleaning.load_and_filter_vocab('../../../lexicons/vocab_general.txt')[:10000]\n",
"lda = analysis.lda.LDA(all_lambda, vocab)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sentiment_df = pd.read_csv('../../../data/individual_2010-2013_pr-st_sentiment.csv', sep=\"\\t\", index_col=0,\n",
" parse_dates=True)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = pd.read_csv('../../../data/models/combined/document_distributions.csv', \n",
" sep='\\t', \n",
" index_col=0,\n",
" parse_dates=['date'])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 40
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>author</th>\n",
" <th>party</th>\n",
" <th>distribution</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>826938</th>\n",
" <td>2013-11-14</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.0 0.0 0.0 0.0 0.0 0.00735294117647 0.022...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>826935</th>\n",
" <td>2013-11-11</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.0 0.0 0.0 0.0 0.0 0.00613496932515 0.012...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>823397</th>\n",
" <td>2013-10-24</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.015873015873...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>823400</th>\n",
" <td>2013-10-23</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.00333333333333 0.0 0.0 0.0 0.0 0.0066666...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>823399</th>\n",
" <td>2013-10-23</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.0 0.0 0.0 0.0 0.0 0.0169491525424 0.0169...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 4 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 41,
"text": [
" date author party \\\n",
"id \n",
"826938 2013-11-14 3457 Democrat \n",
"826935 2013-11-11 3457 Democrat \n",
"823397 2013-10-24 3457 Democrat \n",
"823400 2013-10-23 3457 Democrat \n",
"823399 2013-10-23 3457 Democrat \n",
"\n",
" distribution \n",
"id \n",
"826938 0.0 0.0 0.0 0.0 0.0 0.0 0.00735294117647 0.022... \n",
"826935 0.0 0.0 0.0 0.0 0.0 0.0 0.00613496932515 0.012... \n",
"823397 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.015873015873... \n",
"823400 0.0 0.00333333333333 0.0 0.0 0.0 0.0 0.0066666... \n",
"823399 0.0 0.0 0.0 0.0 0.0 0.0 0.0169491525424 0.0169... \n",
"\n",
"[5 rows x 4 columns]"
]
}
],
"prompt_number": 41
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Data cleaning / setup for sentiment"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# each 'distribution' entry is a whitespace-separated string of topic\n",
"# weights; split it into a list of weight strings, one per topic\n",
"df['distribution'] = df['distribution'].str.split()\n",
"\n",
"# attach the per-document neg/pos sentiment columns (aligned on the id index)\n",
"df = df.join(sentiment_df)\n",
"\n",
"df.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>author</th>\n",
" <th>party</th>\n",
" <th>distribution</th>\n",
" <th>neg</th>\n",
" <th>pos</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>337536</th>\n",
" <td>2012-01-01</td>\n",
" <td> 103482</td>\n",
" <td> Republican</td>\n",
" <td> [0.0, 0.00579150579151, 0.0, 0.0, 0.0, 0.0, 0....</td>\n",
" <td> 0.007203</td>\n",
" <td> 0.042017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>376979</th>\n",
" <td>2012-01-01</td>\n",
" <td> 142</td>\n",
" <td> Democrat</td>\n",
" <td> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02479338...</td>\n",
" <td> 0.017751</td>\n",
" <td> 0.088757</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows \u00d7 6 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 42,
"text": [
" date author party \\\n",
"id \n",
"337536 2012-01-01 103482 Republican \n",
"376979 2012-01-01 142 Democrat \n",
"\n",
" distribution neg pos \n",
"id \n",
"337536 [0.0, 0.00579150579151, 0.0, 0.0, 0.0, 0.0, 0.... 0.007203 0.042017 \n",
"376979 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02479338... 0.017751 0.088757 \n",
"\n",
"[2 rows x 6 columns]"
]
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# move the document id out of the index and index by date instead,\n",
"# so documents can later be grouped by week\n",
"df = df.reset_index().set_index('date')\n",
"\n",
"# drop independents / third parties: keep only the two major parties\n",
"major_parties = ['Democrat', 'Republican']\n",
"df = df[df['party'].isin(major_parties)]\n",
"\n",
"df.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>author</th>\n",
" <th>party</th>\n",
" <th>distribution</th>\n",
" <th>neg</th>\n",
" <th>pos</th>\n",
" </tr>\n",
" <tr>\n",
" <th>date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 337536</td>\n",
" <td> 103482</td>\n",
" <td> Republican</td>\n",
" <td> [0.0, 0.00579150579151, 0.0, 0.0, 0.0, 0.0, 0....</td>\n",
" <td> 0.007203</td>\n",
" <td> 0.042017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 376979</td>\n",
" <td> 142</td>\n",
" <td> Democrat</td>\n",
" <td> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02479338...</td>\n",
" <td> 0.017751</td>\n",
" <td> 0.088757</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows \u00d7 6 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 113,
"text": [
" id author party \\\n",
"date \n",
"2012-01-01 337536 103482 Republican \n",
"2012-01-01 376979 142 Democrat \n",
"\n",
" distribution neg \\\n",
"date \n",
"2012-01-01 [0.0, 0.00579150579151, 0.0, 0.0, 0.0, 0.0, 0.... 0.007203 \n",
"2012-01-01 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02479338... 0.017751 \n",
"\n",
" pos \n",
"date \n",
"2012-01-01 0.042017 \n",
"2012-01-01 0.088757 \n",
"\n",
"[2 rows x 6 columns]"
]
}
],
"prompt_number": 113
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# make a new dataframe that has a multi-index for columns.\n",
"# we'll have 'attrs', 'sentiment', and 'distribution'\n",
"# index will still be the document ID.\n",
"new_df = pd.concat([df[[\"id\", \"party\"]], \n",
" df[[\"neg\", \"pos\"]],\n",
" pd.DataFrame(df.distribution.tolist(), index=df.index)], \n",
" axis=1, keys=[\"attrs\", \"sentiment\", \"distribution\"])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 114
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"new_df.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">attrs</th>\n",
" <th colspan=\"2\" halign=\"left\">sentiment</th>\n",
" <th colspan=\"16\" halign=\"left\">distribution</th>\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>party</th>\n",
" <th>neg</th>\n",
" <th>pos</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>10</th>\n",
" <th>11</th>\n",
" <th>12</th>\n",
" <th>13</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th>date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 337536</td>\n",
" <td> Republican</td>\n",
" <td> 0.007203</td>\n",
" <td> 0.042017</td>\n",
" <td> 0.0</td>\n",
" <td> 0.00579150579151</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.015444015444</td>\n",
" <td> 0.0328185328185</td>\n",
" <td> 0.0289575289575</td>\n",
" <td> 0.00772200772201</td>\n",
" <td> 0.0366795366795</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0019305019305</td>\n",
" <td> 0.003861003861</td>\n",
" <td> 0.00772200772201</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 376979</td>\n",
" <td> Democrat</td>\n",
" <td> 0.017751</td>\n",
" <td> 0.088757</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0247933884298</td>\n",
" <td> 0.00826446280992</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0413223140496</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.00826446280992</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows \u00d7 104 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 115,
"text": [
" attrs attrs sentiment distribution \\\n",
" id party neg pos 0 \n",
"date \n",
"2012-01-01 337536 Republican 0.007203 0.042017 0.0 \n",
"2012-01-01 376979 Democrat 0.017751 0.088757 0.0 \n",
"\n",
" \\\n",
" 1 2 3 4 5 6 \n",
"date \n",
"2012-01-01 0.00579150579151 0.0 0.0 0.0 0.0 0.015444015444 \n",
"2012-01-01 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" \\\n",
" 7 8 9 \n",
"date \n",
"2012-01-01 0.0328185328185 0.0289575289575 0.00772200772201 \n",
"2012-01-01 0.0247933884298 0.00826446280992 0.0 \n",
"\n",
" \\\n",
" 10 11 12 13 14 \n",
"date \n",
"2012-01-01 0.0366795366795 0.0 0.0 0.0019305019305 0.003861003861 \n",
"2012-01-01 0.0413223140496 0.0 0.0 0.0 0.0 \n",
"\n",
" \n",
" 15 \n",
"date \n",
"2012-01-01 0.00772200772201 ... \n",
"2012-01-01 0.00826446280992 ... \n",
"\n",
"[2 rows x 104 columns]"
]
}
],
"prompt_number": 115
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# we want a new dataframe that looks like this:\n",
"# 0 1 2\n",
"# 72 68 59\n",
"# 23 68 21\n",
"# where column 0 is the top topic for that document, column 1 is the 2nd most prevalent, etc.\n",
"ordered = []\n",
"for n, row in new_df.iterrows():\n",
" ordered.append(analysis.lda.similarity.get_topic_range(row.distribution, \n",
" (0,len(row.distribution))))\n",
" \n",
"# create dataframe, make the indices the same\n",
"ordered_topics_df = pd.DataFrame(ordered).set_index(new_df.index)\n",
"\n",
"# swap the old 'distribution' dataframe out with this one\n",
"new_df['distribution'] = ordered_topics_df"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 116
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"new_df.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">attrs</th>\n",
" <th colspan=\"2\" halign=\"left\">sentiment</th>\n",
" <th colspan=\"16\" halign=\"left\">distribution</th>\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>party</th>\n",
" <th>neg</th>\n",
" <th>pos</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>10</th>\n",
" <th>11</th>\n",
" <th>12</th>\n",
" <th>13</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th>date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 337536</td>\n",
" <td> Republican</td>\n",
" <td> 0.007203</td>\n",
" <td> 0.042017</td>\n",
" <td> 72</td>\n",
" <td> 68</td>\n",
" <td> 59</td>\n",
" <td> 82</td>\n",
" <td> 30</td>\n",
" <td> 10</td>\n",
" <td> 34</td>\n",
" <td> 23</td>\n",
" <td> 7</td>\n",
" <td> 89</td>\n",
" <td> 52</td>\n",
" <td> 88</td>\n",
" <td> 8</td>\n",
" <td> 77</td>\n",
" <td> 92</td>\n",
" <td> 61</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 376979</td>\n",
" <td> Democrat</td>\n",
" <td> 0.017751</td>\n",
" <td> 0.088757</td>\n",
" <td> 23</td>\n",
" <td> 68</td>\n",
" <td> 51</td>\n",
" <td> 82</td>\n",
" <td> 88</td>\n",
" <td> 71</td>\n",
" <td> 10</td>\n",
" <td> 44</td>\n",
" <td> 47</td>\n",
" <td> 52</td>\n",
" <td> 59</td>\n",
" <td> 16</td>\n",
" <td> 72</td>\n",
" <td> 80</td>\n",
" <td> 89</td>\n",
" <td> 91</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2010-01-04</th>\n",
" <td> 476373</td>\n",
" <td> Republican</td>\n",
" <td> 0.025078</td>\n",
" <td> 0.040752</td>\n",
" <td> 59</td>\n",
" <td> 80</td>\n",
" <td> 68</td>\n",
" <td> 51</td>\n",
" <td> 72</td>\n",
" <td> 78</td>\n",
" <td> 30</td>\n",
" <td> 10</td>\n",
" <td> 23</td>\n",
" <td> 88</td>\n",
" <td> 8</td>\n",
" <td> 92</td>\n",
" <td> 44</td>\n",
" <td> 58</td>\n",
" <td> 71</td>\n",
" <td> 82</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2010-01-04</th>\n",
" <td> 476393</td>\n",
" <td> Republican</td>\n",
" <td> 0.009434</td>\n",
" <td> 0.015723</td>\n",
" <td> 72</td>\n",
" <td> 31</td>\n",
" <td> 59</td>\n",
" <td> 88</td>\n",
" <td> 8</td>\n",
" <td> 68</td>\n",
" <td> 82</td>\n",
" <td> 71</td>\n",
" <td> 34</td>\n",
" <td> 52</td>\n",
" <td> 61</td>\n",
" <td> 30</td>\n",
" <td> 36</td>\n",
" <td> 10</td>\n",
" <td> 28</td>\n",
" <td> 35</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2010-01-01</th>\n",
" <td> 476402</td>\n",
" <td> Republican</td>\n",
" <td> 0.011086</td>\n",
" <td> 0.026608</td>\n",
" <td> 88</td>\n",
" <td> 72</td>\n",
" <td> 68</td>\n",
" <td> 34</td>\n",
" <td> 59</td>\n",
" <td> 23</td>\n",
" <td> 10</td>\n",
" <td> 73</td>\n",
" <td> 36</td>\n",
" <td> 92</td>\n",
" <td> 8</td>\n",
" <td> 78</td>\n",
" <td> 30</td>\n",
" <td> 71</td>\n",
" <td> 80</td>\n",
" <td> 89</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 104 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 117,
"text": [
" attrs attrs sentiment distribution \\\n",
" id party neg pos 0 1 2 3 \n",
"date \n",
"2012-01-01 337536 Republican 0.007203 0.042017 72 68 59 82 \n",
"2012-01-01 376979 Democrat 0.017751 0.088757 23 68 51 82 \n",
"2010-01-04 476373 Republican 0.025078 0.040752 59 80 68 51 \n",
"2010-01-04 476393 Republican 0.009434 0.015723 72 31 59 88 \n",
"2010-01-01 476402 Republican 0.011086 0.026608 88 72 68 34 \n",
"\n",
" \n",
" 4 5 6 7 8 9 10 11 12 13 14 15 \n",
"date \n",
"2012-01-01 30 10 34 23 7 89 52 88 8 77 92 61 ... \n",
"2012-01-01 88 71 10 44 47 52 59 16 72 80 89 91 ... \n",
"2010-01-04 72 78 30 10 23 88 8 92 44 58 71 82 ... \n",
"2010-01-04 8 68 82 71 34 52 61 30 36 10 28 35 ... \n",
"2010-01-01 59 23 10 73 36 92 8 78 30 71 80 89 ... \n",
"\n",
"[5 rows x 104 columns]"
]
}
],
"prompt_number": 117
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"### Functions for retrieving sentiment info"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# We want a function that will take in:\n",
"# - a list of topics\n",
"# - a \"maximum rank\"\n",
"# and we want to get out:\n",
"#   a week-indexed dataframe with the negative/positive sentiment\n",
"#   for documents that have at least one topic in the list of topics\n",
"#   in that 'maximum rank'\n",
"\n",
"def documents_with_topics(df, topics, n):\n",
"    \"\"\"gets the subset of documents in df where at least one topic in topics\n",
"    is in the top n topics for that document.\n",
"\n",
"    NOTE(review): a document whose top-n ranks match `topics` more than once\n",
"    appears once per match in the concatenated result -- confirm this\n",
"    duplication is intended before relying on the output for counts.\n",
"    \"\"\"\n",
"    # column k of the 'distribution' group holds the k-th ranked topic id\n",
"    dfs = [df[df['distribution'][col].isin(topics)] for col in range(n)]\n",
"    return pd.concat(dfs)\n",
"\n",
"\n",
"def weekly_sentiment_by_party_for_topics(df, topics, n):\n",
"    \"\"\"Returns the weekly sentiment, by party, for all documents\n",
"    that have a topic in `topics` in their top `n` topics.\n",
"    Indexed by (year, week)\n",
"    \"\"\"\n",
"    docs = documents_with_topics(df, topics, n)\n",
"    gb = docs.groupby([docs.index.year, docs.index.week, ('attrs', 'party')])\n",
"    # BUG FIX: `mean` was a bare, undefined name (only `np` is imported in\n",
"    # this notebook); use np.mean explicitly\n",
"    df = gb.agg({('sentiment', 'pos'): np.mean,\n",
"                 ('sentiment', 'neg'): np.mean}).unstack()\n",
"    return df.rename(columns={('sentiment', 'neg'): 'neg',\n",
"                              ('sentiment', 'pos'): 'pos'})"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 341
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# later, we need a conversion of (year, week) -> month.\n",
"# we can get it by querying the resulting dictionary\n",
"month_df = pd.DataFrame(df)\n",
"month_df['month'] = month_df.index.month\n",
"month_df = month_df.groupby([df.index.year, df.index.week]).agg({'month': lambda l: list(set(l))[0]})\n",
"month_dict = month_df.to_dict()['month']"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 242
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# convenience method for accessing months from (year, week):\n",
"def get_month(year, week):\n",
" return month_dict[(year, week)]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 243
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# convenience method for accessing sentiment\n",
"def sentiment_by_party(topic, party, sent):\n",
" return weekly_sentiment_by_party_for_topics(new_df, [topic], 5)[sent][party]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 198
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Count data\n",
"Functions, etc"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# given:\n",
"# - a topic\n",
"# - a party\n",
"# we want to get out:\n",
"#   a list of statement counts, normalized by the number of\n",
"#   representatives in that party during that time.\n",
"#   the length of this list should be equal to the number of\n",
"#   weeks in our corpus.\n",
"\n",
"def get_counts(t, party):\n",
"    \"\"\"dict of {time : count} for the given party, loaded from disk.\"\"\"\n",
"    path = \"../../../data/topics/{0}/{1}Weekly_topic.pkl\".format(t, party.lower())\n",
"    # context manager so the file handle is closed promptly\n",
"    # (the original leaked the handle from a bare open())\n",
"    with open(path, 'rb') as f:\n",
"        return pickle.load(f)\n",
"\n",
"def dem_normalize_list(vals):\n",
"    \"\"\"normalizing values for democratic statement counts.\n",
"    Constants are presumably senate + house seat totals for the three\n",
"    congressional periods covered -- TODO confirm the week boundaries\n",
"    (0-53, 54-157, 158+).\n",
"    \"\"\"\n",
"    return [57+257 for i in range(54)] + \\\n",
"           [51+193 for i in range(54,158)] + \\\n",
"           [53+199 for i in range(158, len(vals))]\n",
"\n",
"def rep_normalize_list(vals):\n",
"    \"\"\"normalizing values for republican statement counts (see above).\"\"\"\n",
"    return [41+178 for i in range(54)] + \\\n",
"           [47+242 for i in range(54,158)] + \\\n",
"           [45+234 for i in range(158, len(vals))]\n",
"\n",
"def divide_lists(l1, l2):\n",
"    \"\"\"returns a list of the elementwise ratio l1[i] / l2[i].\"\"\"\n",
"    return [float(x) / float(y) for x, y in zip(l1, l2)]\n",
"\n",
"def normalize_counts(vals, party):\n",
"    \"\"\"Divide raw counts by the party's per-period seat totals.\"\"\"\n",
"    if party.lower() == \"republican\":\n",
"        nlist = rep_normalize_list(vals)\n",
"    else:\n",
"        nlist = dem_normalize_list(vals)\n",
"    return divide_lists(vals, nlist)\n",
"\n",
"\n",
"def get_normalized_counts(t, party):\n",
"    \"\"\"a (year, week) indexed series of normalized statement counts.\n",
"    A statement is only counted if `t` is one of the statement's top 5 topics.\n",
"    Counts are normalized by the number of representatives in `party` when the statement\n",
"    was released.\n",
"    \"\"\"\n",
"    # get the counts, normalize them\n",
"    counts = get_counts(t, party)\n",
"    # NOTE(review): this relies on counts.values() being in chronological\n",
"    # order; a plain dict does not guarantee that -- verify the pickle\n",
"    # stores an OrderedDict (or sort the keys first).\n",
"    vals = normalize_counts(counts.values(), party)\n",
"\n",
"    # convert to series, convert the index to datetime\n",
"    series = pd.Series(dict(zip(counts.keys(), vals)))\n",
"    series.index = pd.to_datetime(series.index)\n",
"\n",
"    # change index to (year, week)\n",
"    # BUG FIX: the original referenced an undefined name `s` here,\n",
"    # raising NameError; the series is called `series`.\n",
"    return series.groupby([series.index.year, series.index.week]).sum()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 163
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Correlations"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# getting the opposite party. made difficult because oren had 'democrats' in the filenames.\n",
"opposite = {'democrat': 'republican',\n",
"            'democrats': 'republican',\n",
"            'republican': 'democrats'}\n",
"\n",
"\n",
"def create_regression_df(df):\n",
"    \"\"\"creates the regression dataframe using the data in df\n",
"    and other necessary controls (a categorical month column derived\n",
"    from the (year, week) index).\n",
"    \"\"\"\n",
"    # BUG FIX: the original read the global `reg_example_df` instead of the\n",
"    # `df` argument, so the month control came from an unrelated frame.\n",
"    months = [get_month(t[0], t[1]) for t in list(df.index)]\n",
"    df['month'] = months\n",
"    return df\n",
"\n",
"\n",
"def create_regression_formula(dep_var, reg_df):\n",
"    \"\"\"creates the regression formula for OLS given\n",
"    the dependent variable string and the dataframe that has\n",
"    columns for the dependent variable and all independent variables.\n",
"    \"\"\"\n",
"    # every column except the dependent variable and the month control\n",
"    # enters the formula as an independent variable\n",
"    cols = [c for c in reg_df.columns if c not in ['month', dep_var]]\n",
"    return \"{0} ~ {1} + C(month)\".format(dep_var, \" + \".join(cols))\n",
"\n",
"################################################\n",
"# equation for count/sentiment regression:\n",
"# count ~ lag_count + lag_sentiment + C(months)\n",
"# count/count regression:\n",
"# count ~ lag_count + lag_count_opp + C(months)\n",
"################################################\n",
"\n",
"\n",
"def do_count_regression(atopic, btopic, party, shift):\n",
"    \"\"\"predicting the count of documents in `atopic` for `party` by the\n",
"    party's own lagged count and the opposing party's count on `btopic`\n",
"    lagged by `shift` weeks.\n",
"    NOTE(review): requires `smf` (statsmodels.formula.api) to already be\n",
"    in the session namespace -- it is never imported in this notebook.\n",
"    \"\"\"\n",
"    opposite_party = opposite[party.lower()]\n",
"\n",
"    # get the normalized counts for this party and the opposite party\n",
"    counts = get_normalized_counts(atopic, party)\n",
"    counts_opp = get_normalized_counts(btopic, opposite_party)\n",
"\n",
"    # define the dependent variable\n",
"    dep_var = '{0}_counts'.format(party)\n",
"\n",
"    # create the dataframe with the correct column names based on party\n",
"    reg_df = pd.DataFrame({dep_var : counts,\n",
"                           '{0}_{1}_lag_counts'.format(party, atopic): counts.shift(),\n",
"                           '{0}_{1}_lag_{2}_counts'.format(opposite_party, btopic, shift): counts_opp.shift(shift)})\n",
"\n",
"    # add the categorical month control, build the formula, fit OLS\n",
"    reg_df = create_regression_df(reg_df)\n",
"    formula = create_regression_formula(dep_var, reg_df)\n",
"    est = smf.ols(formula=formula, data=reg_df).fit()\n",
"    return est\n",
"\n",
"\n",
"def do_sentiment_regression(atopic, btopic, party, sent, shift):\n",
"    \"\"\"predicting the count of documents in `atopic` for `party` by the\n",
"    party's own lagged count and the lagged sentiment of type `sent`\n",
"    ('pos' or 'neg') expressed by the opposing party on `btopic`.\n",
"    \"\"\"\n",
"    opposite_party = opposite[party.lower()]\n",
"\n",
"    # get the normalized counts for this party\n",
"    counts = get_normalized_counts(atopic, party)\n",
"\n",
"    # sentiment for opposite party on btopic\n",
"    sentiment_opp = sentiment_by_party(btopic, opposite_party.title(), sent)\n",
"\n",
"    # define dependent variable\n",
"    dep_var = '{0}_counts'.format(party)\n",
"\n",
"    # create the dataframe with the correct column names based on party\n",
"    reg_df = pd.DataFrame({dep_var : counts,\n",
"                           '{0}_{1}_lag_counts'.format(party, atopic): counts.shift(),\n",
"                           '{0}_{1}_{2}_lag_{3}_sent'.format(opposite_party, sent, btopic, shift): sentiment_opp.shift(shift)})\n",
"\n",
"    # create regression dataframe and formula, then fit\n",
"    reg_df = create_regression_df(reg_df)\n",
"    formula = create_regression_formula(dep_var, reg_df)\n",
"    est = smf.ols(formula=formula, data=reg_df).fit()\n",
"    return est"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 405
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Generate results, write to disk:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import itertools\n",
"import os\n",
"\n",
"# these are the clusters we're interested in\n",
"clusters = {'energy': [38, 47, 69, 71],\n",
"            'health': [30, 51, 80]}\n",
"\n",
"# get list of all topics, flattened, then get all topic pairs\n",
"all_topics = itertools.chain(*clusters.values())\n",
"topic_pairs = itertools.permutations(all_topics, 2)\n",
"\n",
"# all our shifts\n",
"shifts = [0, 1, 2]\n",
"\n",
"\n",
"# files will be saved as:\n",
"# ../../../data/results/correlations/{0}/{1}/{2}/{3}/{4}.html\n",
"# where:\n",
"# {0} : one of {Democrats_counts, Republican_counts}, depending on the value\n",
"#       being predicted\n",
"# {1} : the topic # whose counts are being predicted\n",
"# {2} : one of {sent, count}, depending on the predictor\n",
"# {3} : the lag amount used\n",
"# {4} : the predictor topic #\n",
"\n",
"\n",
"def write_to_path(content, filename):\n",
"    \"\"\"Write `content` to `filename`, creating parent directories as needed.\"\"\"\n",
"    if not os.path.exists(os.path.dirname(filename)):\n",
"        os.makedirs(os.path.dirname(filename))\n",
"    with open(filename, 'wb') as f:\n",
"        f.write(content)\n",
"\n",
"significant = []  # filenames of results whose predictor is significant\n",
"\n",
"data_dir = '../../../data/results/correlations/'\n",
"\n",
"# iterate over every (topic pair, lag, party, regression kind) combination\n",
"for topics in topic_pairs:\n",
"    for shift in shifts:\n",
"        for party in ['Democrats', 'Republican']:\n",
"            for kind in ['count', 'sent']:\n",
"                if kind == 'count':\n",
"                    # count ~ lag_count + lag_count_opp + C(months)\n",
"                    res = do_count_regression(topics[0], topics[1], party, shift)\n",
"                    filename = \"{0}_counts/{1}/{2}/{3}/{4}.txt\".format(party,\n",
"                                                                       topics[0],\n",
"                                                                       kind,\n",
"                                                                       shift,\n",
"                                                                       topics[1])\n",
"                    filename = os.path.join(data_dir, filename)\n",
"                    write_to_path(str(res.summary()), filename)\n",
"\n",
"                    # if the predictor is significant, add it to a list\n",
"                    if res.pvalues.tail(1).iloc[0] < .05:\n",
"                        significant.append(filename)\n",
"\n",
"                else:\n",
"                    # sentiment predictor: one regression per sentiment category\n",
"                    for sent_type in ['pos', 'neg']:\n",
"                        # BUG FIX: this branch previously ran\n",
"                        # do_count_regression and ignored sent_type, so the\n",
"                        # 'sent' result files contained count regressions.\n",
"                        res = do_sentiment_regression(topics[0], topics[1], party, sent_type, shift)\n",
"                        filename = \"{0}_counts/{1}/{2}/{3}/{4}/{5}.txt\".format(party,\n",
"                                                                               topics[0],\n",
"                                                                               kind,\n",
"                                                                               sent_type,\n",
"                                                                               shift,\n",
"                                                                               topics[1])\n",
"                        filename = os.path.join(data_dir, filename)\n",
"                        write_to_path(str(res.summary()), filename)\n",
"                        # if the predictor is significant, add it to a list\n",
"                        if res.pvalues.tail(1).iloc[0] < .05:\n",
"                            significant.append(filename)\n",
"\n",
"# write significant results to their own file, line by line:\n",
"with open(os.path.join(data_dir, 'significant.txt'), 'wb') as f:\n",
"    f.write('\\n'.join(significant))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 441
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"all_topics = itertools.chain(*clusters.values())\n",
"topic_pairs = itertools.permutations(all_topics, 2)\n",
"list(topic_pairs)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 440,
"text": [
"[(38, 47),\n",
" (38, 69),\n",
" (38, 71),\n",
" (38, 30),\n",
" (38, 51),\n",
" (38, 80),\n",
" (47, 38),\n",
" (47, 69),\n",
" (47, 71),\n",
" (47, 30),\n",
" (47, 51),\n",
" (47, 80),\n",
" (69, 38),\n",
" (69, 47),\n",
" (69, 71),\n",
" (69, 30),\n",
" (69, 51),\n",
" (69, 80),\n",
" (71, 38),\n",
" (71, 47),\n",
" (71, 69),\n",
" (71, 30),\n",
" (71, 51),\n",
" (71, 80),\n",
" (30, 38),\n",
" (30, 47),\n",
" (30, 69),\n",
" (30, 71),\n",
" (30, 51),\n",
" (30, 80),\n",
" (51, 38),\n",
" (51, 47),\n",
" (51, 69),\n",
" (51, 71),\n",
" (51, 30),\n",
" (51, 80),\n",
" (80, 38),\n",
" (80, 47),\n",
" (80, 69),\n",
" (80, 71),\n",
" (80, 30),\n",
" (80, 51)]"
]
}
],
"prompt_number": 440
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res = do_sentiment_regression(30, 30, 'Democrats', 'pos', 1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 349
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res = do_count_regression(30, 30, 'Democrats', 0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 391
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res.pvalues.tail(1).iloc[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 416,
"text": [
"0.40403599607963181"
]
}
],
"prompt_number": 416
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res.tvalues"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 369,
"text": [
"Intercept 1.821789e+00\n",
"C(month)[T.2] -2.537987e+00\n",
"C(month)[T.3] 1.709296e-01\n",
"C(month)[T.4] -3.261010e+00\n",
"C(month)[T.5] -4.321774e+00\n",
"C(month)[T.6] -4.500173e+00\n",
"C(month)[T.7] -4.196597e+00\n",
"C(month)[T.8] -2.763157e+00\n",
"C(month)[T.9] -1.952323e+00\n",
"C(month)[T.10] -5.307118e+00\n",
"C(month)[T.11] -6.063182e+00\n",
"C(month)[T.12] -5.222220e+00\n",
"Democrats_30_lag_counts 4.296383e+15\n",
"republican_30_lag_counts 3.273286e+00\n",
"dtype: float64"
]
}
],
"prompt_number": 369
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res.params"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 393,
"text": [
"Intercept 0.009526\n",
"C(month)[T.2] 0.013784\n",
"C(month)[T.3] 0.027370\n",
"C(month)[T.4] 0.029769\n",
"C(month)[T.5] 0.025660\n",
"C(month)[T.6] 0.077222\n",
"C(month)[T.7] -0.016947\n",
"C(month)[T.8] 0.007844\n",
"C(month)[T.9] 0.011402\n",
"C(month)[T.10] -0.025175\n",
"C(month)[T.11] -0.010383\n",
"C(month)[T.12] 0.040636\n",
"Democrats_30_lag_counts 0.090149\n",
"republican_30_lag_0_counts 0.590797\n",
"dtype: float64"
]
}
],
"prompt_number": 393
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"HTML(res.summary().as_html())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<table class=\"simpletable\">\n",
"<caption>OLS Regression Results</caption>\n",
"<tr>\n",
" <th>Dep. Variable:</th> <td>Democrats_counts</td> <th> R-squared: </th> <td> 0.730</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Model:</th> <td>OLS</td> <th> Adj. R-squared: </th> <td> 0.712</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Method:</th> <td>Least Squares</td> <th> F-statistic: </th> <td> 40.38</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Date:</th> <td>Fri, 18 Jul 2014</td> <th> Prob (F-statistic):</th> <td>4.17e-48</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Time:</th> <td>17:30:32</td> <th> Log-Likelihood: </th> <td> 251.04</td>\n",
"</tr>\n",
"<tr>\n",
" <th>No. Observations:</th> <td> 208</td> <th> AIC: </th> <td> -474.1</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Df Residuals:</th> <td> 194</td> <th> BIC: </th> <td> -427.4</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Df Model:</th> <td> 13</td> <th> </th> <td> </td> \n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>coef</th> <th>std err</th> <th>t</th> <th>P>|t|</th> <th>[95.0% Conf. Int.]</th> \n",
"</tr>\n",
"<tr>\n",
" <th>Intercept</th> <td> 0.0095</td> <td> 0.017</td> <td> 0.550</td> <td> 0.583</td> <td> -0.025 0.044</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.2]</th> <td> 0.0138</td> <td> 0.025</td> <td> 0.554</td> <td> 0.580</td> <td> -0.035 0.063</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.3]</th> <td> 0.0274</td> <td> 0.025</td> <td> 1.077</td> <td> 0.283</td> <td> -0.023 0.077</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.4]</th> <td> 0.0298</td> <td> 0.024</td> <td> 1.233</td> <td> 0.219</td> <td> -0.018 0.077</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.5]</th> <td> 0.0257</td> <td> 0.024</td> <td> 1.063</td> <td> 0.289</td> <td> -0.022 0.073</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.6]</th> <td> 0.0772</td> <td> 0.025</td> <td> 3.095</td> <td> 0.002</td> <td> 0.028 0.126</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.7]</th> <td> -0.0169</td> <td> 0.026</td> <td> -0.654</td> <td> 0.514</td> <td> -0.068 0.034</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.8]</th> <td> 0.0078</td> <td> 0.023</td> <td> 0.338</td> <td> 0.735</td> <td> -0.038 0.054</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.9]</th> <td> 0.0114</td> <td> 0.025</td> <td> 0.463</td> <td> 0.644</td> <td> -0.037 0.060</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.10]</th> <td> -0.0252</td> <td> 0.024</td> <td> -1.045</td> <td> 0.297</td> <td> -0.073 0.022</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.11]</th> <td> -0.0104</td> <td> 0.024</td> <td> -0.425</td> <td> 0.672</td> <td> -0.059 0.038</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.12]</th> <td> 0.0406</td> <td> 0.026</td> <td> 1.566</td> <td> 0.119</td> <td> -0.011 0.092</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Democrats_30_lag_counts</th> <td> 0.0901</td> <td> 0.040</td> <td> 2.248</td> <td> 0.026</td> <td> 0.011 0.169</td>\n",
"</tr>\n",
"<tr>\n",
" <th>republican_30_lag_0_counts</th> <td> 0.5908</td> <td> 0.029</td> <td> 20.349</td> <td> 0.000</td> <td> 0.534 0.648</td>\n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <th>Omnibus:</th> <td>80.910</td> <th> Durbin-Watson: </th> <td> 1.572</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Prob(Omnibus):</th> <td> 0.000</td> <th> Jarque-Bera (JB): </th> <td> 406.507</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Skew:</th> <td> 1.426</td> <th> Prob(JB): </th> <td>5.35e-89</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Kurtosis:</th> <td> 9.226</td> <th> Cond. No. </th> <td> 12.2</td>\n",
"</tr>\n",
"</table>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 399,
"text": [
"<IPython.core.display.HTML at 0x47e5db10>"
]
}
],
"prompt_number": 399
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Example code: OLS regression of statement counts on lagged counts, lagged sentiment, and month dummies"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import statsmodels.formula.api as smf\n",
"from IPython.core.display import HTML\n",
"\n",
"def short_summary(est):\n",
"    \"\"\"Render only the coefficient table of a fitted statsmodels\n",
"    results object as an HTML table (tables[1] of the full summary).\n",
"    \"\"\"\n",
"    return HTML(est.summary().tables[1].as_html())\n",
"\n",
"def ols_regression(df):\n",
"    \"\"\"Fit an OLS of `count` on lagged count, lagged sentiment, and\n",
"    month dummies, and return the coefficient table as HTML.\n",
"\n",
"    C(months) expands the categorical month variable into dummy\n",
"    columns (main effects only, since the formula uses `+`; an\n",
"    interaction with the other regressors would require `*`).\n",
"\n",
"    NOTE(review): the formula references a column named `months`,\n",
"    but the fitted models shown above use `C(month)` -- confirm the\n",
"    actual column name in `df` before relying on this example.\n",
"    \"\"\"\n",
"    est = smf.ols(formula='count ~ lag_count + lag_sentiment + C(months)',\n",
"                  data=df).fit()\n",
"    return short_summary(est)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 260
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment