Skip to content

Instantly share code, notes, and snippets.

@dcalacci
Last active August 29, 2015 14:04
Show Gist options
  • Save dcalacci/4fe68425a5a16c59810c to your computer and use it in GitHub Desktop.
Save dcalacci/4fe68425a5a16c59810c to your computer and use it in GitHub Desktop.
time series correlation of sentiment and statement counts
{
"metadata": {
"name": "",
"signature": "sha256:ba1951a9424c0d8948a247be8ff6ed159e05dc19bdaaf59eaacb0f7e7f66ca45"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import pickle"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 60
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# add project-specific src directories\n",
"import sys\n",
"sys.path.insert(2, '/home/dcalacci/lazerlab/congressional-public-speech/dan/src')\n",
"# project imports\n",
"from analysis import authors\n",
"import analysis\n",
"import datacleaning"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"all_lambda = np.loadtxt('../../../data/models/combined/lambda-2270.dat')\n",
"vocab = datacleaning.load_and_filter_vocab('../../../lexicons/vocab_general.txt')[:10000]\n",
"lda = analysis.lda.LDA(all_lambda, vocab)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sentiment_df = pd.read_csv('../../../data/individual_2010-2013_pr-st_sentiment.csv', sep=\"\\t\", index_col=0,\n",
" parse_dates=True)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = pd.read_csv('../../../data/models/combined/document_distributions.csv', \n",
" sep='\\t', \n",
" index_col=0,\n",
" parse_dates=['date'])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 40
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>author</th>\n",
" <th>party</th>\n",
" <th>distribution</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>826938</th>\n",
" <td>2013-11-14</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.0 0.0 0.0 0.0 0.0 0.00735294117647 0.022...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>826935</th>\n",
" <td>2013-11-11</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.0 0.0 0.0 0.0 0.0 0.00613496932515 0.012...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>823397</th>\n",
" <td>2013-10-24</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.015873015873...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>823400</th>\n",
" <td>2013-10-23</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.00333333333333 0.0 0.0 0.0 0.0 0.0066666...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>823399</th>\n",
" <td>2013-10-23</td>\n",
" <td> 3457</td>\n",
" <td> Democrat</td>\n",
" <td> 0.0 0.0 0.0 0.0 0.0 0.0 0.0169491525424 0.0169...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 4 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 41,
"text": [
" date author party \\\n",
"id \n",
"826938 2013-11-14 3457 Democrat \n",
"826935 2013-11-11 3457 Democrat \n",
"823397 2013-10-24 3457 Democrat \n",
"823400 2013-10-23 3457 Democrat \n",
"823399 2013-10-23 3457 Democrat \n",
"\n",
" distribution \n",
"id \n",
"826938 0.0 0.0 0.0 0.0 0.0 0.0 0.00735294117647 0.022... \n",
"826935 0.0 0.0 0.0 0.0 0.0 0.0 0.00613496932515 0.012... \n",
"823397 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.015873015873... \n",
"823400 0.0 0.00333333333333 0.0 0.0 0.0 0.0 0.0066666... \n",
"823399 0.0 0.0 0.0 0.0 0.0 0.0 0.0169491525424 0.0169... \n",
"\n",
"[5 rows x 4 columns]"
]
}
],
"prompt_number": 41
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Data cleaning / setup for sentiment"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# each 'distribution' entry is a whitespace-separated string of topic\n",
"# weights; split it into a list of weight strings, one per topic\n",
"df['distribution'] = df['distribution'].str.split()\n",
"\n",
"# attach the per-document neg/pos sentiment columns (aligned on the id index)\n",
"df = df.join(sentiment_df)\n",
"\n",
"df.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>author</th>\n",
" <th>party</th>\n",
" <th>distribution</th>\n",
" <th>neg</th>\n",
" <th>pos</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>337536</th>\n",
" <td>2012-01-01</td>\n",
" <td> 103482</td>\n",
" <td> Republican</td>\n",
" <td> [0.0, 0.00579150579151, 0.0, 0.0, 0.0, 0.0, 0....</td>\n",
" <td> 0.007203</td>\n",
" <td> 0.042017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>376979</th>\n",
" <td>2012-01-01</td>\n",
" <td> 142</td>\n",
" <td> Democrat</td>\n",
" <td> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02479338...</td>\n",
" <td> 0.017751</td>\n",
" <td> 0.088757</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows \u00d7 6 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 42,
"text": [
" date author party \\\n",
"id \n",
"337536 2012-01-01 103482 Republican \n",
"376979 2012-01-01 142 Democrat \n",
"\n",
" distribution neg pos \n",
"id \n",
"337536 [0.0, 0.00579150579151, 0.0, 0.0, 0.0, 0.0, 0.... 0.007203 0.042017 \n",
"376979 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02479338... 0.017751 0.088757 \n",
"\n",
"[2 rows x 6 columns]"
]
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# move the document id out of the index and index by date instead,\n",
"# so documents can later be grouped by week\n",
"df = df.reset_index().set_index('date')\n",
"\n",
"# drop independents / third parties: keep only the two major parties\n",
"major_parties = ['Democrat', 'Republican']\n",
"df = df[df['party'].isin(major_parties)]\n",
"\n",
"df.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>author</th>\n",
" <th>party</th>\n",
" <th>distribution</th>\n",
" <th>neg</th>\n",
" <th>pos</th>\n",
" </tr>\n",
" <tr>\n",
" <th>date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 337536</td>\n",
" <td> 103482</td>\n",
" <td> Republican</td>\n",
" <td> [0.0, 0.00579150579151, 0.0, 0.0, 0.0, 0.0, 0....</td>\n",
" <td> 0.007203</td>\n",
" <td> 0.042017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 376979</td>\n",
" <td> 142</td>\n",
" <td> Democrat</td>\n",
" <td> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02479338...</td>\n",
" <td> 0.017751</td>\n",
" <td> 0.088757</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows \u00d7 6 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 113,
"text": [
" id author party \\\n",
"date \n",
"2012-01-01 337536 103482 Republican \n",
"2012-01-01 376979 142 Democrat \n",
"\n",
" distribution neg \\\n",
"date \n",
"2012-01-01 [0.0, 0.00579150579151, 0.0, 0.0, 0.0, 0.0, 0.... 0.007203 \n",
"2012-01-01 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02479338... 0.017751 \n",
"\n",
" pos \n",
"date \n",
"2012-01-01 0.042017 \n",
"2012-01-01 0.088757 \n",
"\n",
"[2 rows x 6 columns]"
]
}
],
"prompt_number": 113
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# make a new dataframe that has a multi-index for columns.\n",
"# we'll have 'attrs', 'sentiment', and 'distribution'\n",
"# index will still be the document ID.\n",
"new_df = pd.concat([df[[\"id\", \"party\"]], \n",
" df[[\"neg\", \"pos\"]],\n",
" pd.DataFrame(df.distribution.tolist(), index=df.index)], \n",
" axis=1, keys=[\"attrs\", \"sentiment\", \"distribution\"])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 114
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"new_df.head(2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">attrs</th>\n",
" <th colspan=\"2\" halign=\"left\">sentiment</th>\n",
" <th colspan=\"16\" halign=\"left\">distribution</th>\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>party</th>\n",
" <th>neg</th>\n",
" <th>pos</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>10</th>\n",
" <th>11</th>\n",
" <th>12</th>\n",
" <th>13</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th>date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 337536</td>\n",
" <td> Republican</td>\n",
" <td> 0.007203</td>\n",
" <td> 0.042017</td>\n",
" <td> 0.0</td>\n",
" <td> 0.00579150579151</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.015444015444</td>\n",
" <td> 0.0328185328185</td>\n",
" <td> 0.0289575289575</td>\n",
" <td> 0.00772200772201</td>\n",
" <td> 0.0366795366795</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0019305019305</td>\n",
" <td> 0.003861003861</td>\n",
" <td> 0.00772200772201</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 376979</td>\n",
" <td> Democrat</td>\n",
" <td> 0.017751</td>\n",
" <td> 0.088757</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0247933884298</td>\n",
" <td> 0.00826446280992</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0413223140496</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.0</td>\n",
" <td> 0.00826446280992</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows \u00d7 104 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 115,
"text": [
" attrs attrs sentiment distribution \\\n",
" id party neg pos 0 \n",
"date \n",
"2012-01-01 337536 Republican 0.007203 0.042017 0.0 \n",
"2012-01-01 376979 Democrat 0.017751 0.088757 0.0 \n",
"\n",
" \\\n",
" 1 2 3 4 5 6 \n",
"date \n",
"2012-01-01 0.00579150579151 0.0 0.0 0.0 0.0 0.015444015444 \n",
"2012-01-01 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" \\\n",
" 7 8 9 \n",
"date \n",
"2012-01-01 0.0328185328185 0.0289575289575 0.00772200772201 \n",
"2012-01-01 0.0247933884298 0.00826446280992 0.0 \n",
"\n",
" \\\n",
" 10 11 12 13 14 \n",
"date \n",
"2012-01-01 0.0366795366795 0.0 0.0 0.0019305019305 0.003861003861 \n",
"2012-01-01 0.0413223140496 0.0 0.0 0.0 0.0 \n",
"\n",
" \n",
" 15 \n",
"date \n",
"2012-01-01 0.00772200772201 ... \n",
"2012-01-01 0.00826446280992 ... \n",
"\n",
"[2 rows x 104 columns]"
]
}
],
"prompt_number": 115
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# we want a new dataframe that looks like this:\n",
"# 0 1 2\n",
"# 72 68 59\n",
"# 23 68 21\n",
"# where column 0 is the top topic for that document, column 1 is the 2nd most prevalent, etc.\n",
"ordered = []\n",
"for n, row in new_df.iterrows():\n",
" ordered.append(analysis.lda.similarity.get_topic_range(row.distribution, \n",
" (0,len(row.distribution))))\n",
" \n",
"# create dataframe, make the indices the same\n",
"ordered_topics_df = pd.DataFrame(ordered).set_index(new_df.index)\n",
"\n",
"# swap the old 'distribution' dataframe out with this one\n",
"new_df['distribution'] = ordered_topics_df"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 116
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"new_df.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">attrs</th>\n",
" <th colspan=\"2\" halign=\"left\">sentiment</th>\n",
" <th colspan=\"16\" halign=\"left\">distribution</th>\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>party</th>\n",
" <th>neg</th>\n",
" <th>pos</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>10</th>\n",
" <th>11</th>\n",
" <th>12</th>\n",
" <th>13</th>\n",
" <th>14</th>\n",
" <th>15</th>\n",
" <th></th>\n",
" </tr>\n",
" <tr>\n",
" <th>date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 337536</td>\n",
" <td> Republican</td>\n",
" <td> 0.007203</td>\n",
" <td> 0.042017</td>\n",
" <td> 72</td>\n",
" <td> 68</td>\n",
" <td> 59</td>\n",
" <td> 82</td>\n",
" <td> 30</td>\n",
" <td> 10</td>\n",
" <td> 34</td>\n",
" <td> 23</td>\n",
" <td> 7</td>\n",
" <td> 89</td>\n",
" <td> 52</td>\n",
" <td> 88</td>\n",
" <td> 8</td>\n",
" <td> 77</td>\n",
" <td> 92</td>\n",
" <td> 61</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2012-01-01</th>\n",
" <td> 376979</td>\n",
" <td> Democrat</td>\n",
" <td> 0.017751</td>\n",
" <td> 0.088757</td>\n",
" <td> 23</td>\n",
" <td> 68</td>\n",
" <td> 51</td>\n",
" <td> 82</td>\n",
" <td> 88</td>\n",
" <td> 71</td>\n",
" <td> 10</td>\n",
" <td> 44</td>\n",
" <td> 47</td>\n",
" <td> 52</td>\n",
" <td> 59</td>\n",
" <td> 16</td>\n",
" <td> 72</td>\n",
" <td> 80</td>\n",
" <td> 89</td>\n",
" <td> 91</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2010-01-04</th>\n",
" <td> 476373</td>\n",
" <td> Republican</td>\n",
" <td> 0.025078</td>\n",
" <td> 0.040752</td>\n",
" <td> 59</td>\n",
" <td> 80</td>\n",
" <td> 68</td>\n",
" <td> 51</td>\n",
" <td> 72</td>\n",
" <td> 78</td>\n",
" <td> 30</td>\n",
" <td> 10</td>\n",
" <td> 23</td>\n",
" <td> 88</td>\n",
" <td> 8</td>\n",
" <td> 92</td>\n",
" <td> 44</td>\n",
" <td> 58</td>\n",
" <td> 71</td>\n",
" <td> 82</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2010-01-04</th>\n",
" <td> 476393</td>\n",
" <td> Republican</td>\n",
" <td> 0.009434</td>\n",
" <td> 0.015723</td>\n",
" <td> 72</td>\n",
" <td> 31</td>\n",
" <td> 59</td>\n",
" <td> 88</td>\n",
" <td> 8</td>\n",
" <td> 68</td>\n",
" <td> 82</td>\n",
" <td> 71</td>\n",
" <td> 34</td>\n",
" <td> 52</td>\n",
" <td> 61</td>\n",
" <td> 30</td>\n",
" <td> 36</td>\n",
" <td> 10</td>\n",
" <td> 28</td>\n",
" <td> 35</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2010-01-01</th>\n",
" <td> 476402</td>\n",
" <td> Republican</td>\n",
" <td> 0.011086</td>\n",
" <td> 0.026608</td>\n",
" <td> 88</td>\n",
" <td> 72</td>\n",
" <td> 68</td>\n",
" <td> 34</td>\n",
" <td> 59</td>\n",
" <td> 23</td>\n",
" <td> 10</td>\n",
" <td> 73</td>\n",
" <td> 36</td>\n",
" <td> 92</td>\n",
" <td> 8</td>\n",
" <td> 78</td>\n",
" <td> 30</td>\n",
" <td> 71</td>\n",
" <td> 80</td>\n",
" <td> 89</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows \u00d7 104 columns</p>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 117,
"text": [
" attrs attrs sentiment distribution \\\n",
" id party neg pos 0 1 2 3 \n",
"date \n",
"2012-01-01 337536 Republican 0.007203 0.042017 72 68 59 82 \n",
"2012-01-01 376979 Democrat 0.017751 0.088757 23 68 51 82 \n",
"2010-01-04 476373 Republican 0.025078 0.040752 59 80 68 51 \n",
"2010-01-04 476393 Republican 0.009434 0.015723 72 31 59 88 \n",
"2010-01-01 476402 Republican 0.011086 0.026608 88 72 68 34 \n",
"\n",
" \n",
" 4 5 6 7 8 9 10 11 12 13 14 15 \n",
"date \n",
"2012-01-01 30 10 34 23 7 89 52 88 8 77 92 61 ... \n",
"2012-01-01 88 71 10 44 47 52 59 16 72 80 89 91 ... \n",
"2010-01-04 72 78 30 10 23 88 8 92 44 58 71 82 ... \n",
"2010-01-04 8 68 82 71 34 52 61 30 36 10 28 35 ... \n",
"2010-01-01 59 23 10 73 36 92 8 78 30 71 80 89 ... \n",
"\n",
"[5 rows x 104 columns]"
]
}
],
"prompt_number": 117
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"### Functions for retrieving sentiment info"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# We want a function that will take in:\n",
"# - a list of topics\n",
"# - a \"maximum rank\"\n",
"# and we want to get out:\n",
"#   a week-indexed dataframe with the negative/positive sentiment\n",
"#   for documents that have at least one topic in the list of topics\n",
"#   in that 'maximum rank'\n",
"\n",
"def documents_with_topics(df, topics, n):\n",
"    \"\"\"gets the subset of documents in df where at least one topic in topics\n",
"    is in the top n topics for that document.\n",
"\n",
"    NOTE(review): a document whose top-n ranks match `topics` more than once\n",
"    appears once per match in the concatenated result -- confirm this\n",
"    duplication is intended before relying on the output for counts.\n",
"    \"\"\"\n",
"    # column k of the 'distribution' group holds the k-th ranked topic id\n",
"    dfs = [df[df['distribution'][col].isin(topics)] for col in range(n)]\n",
"    return pd.concat(dfs)\n",
"\n",
"\n",
"def weekly_sentiment_by_party_for_topics(df, topics, n):\n",
"    \"\"\"Returns the weekly sentiment, by party, for all documents\n",
"    that have a topic in `topics` in their top `n` topics.\n",
"    Indexed by (year, week)\n",
"    \"\"\"\n",
"    docs = documents_with_topics(df, topics, n)\n",
"    gb = docs.groupby([docs.index.year, docs.index.week, ('attrs', 'party')])\n",
"    # BUG FIX: `mean` was a bare, undefined name (only `np` is imported in\n",
"    # this notebook); use np.mean explicitly\n",
"    df = gb.agg({('sentiment', 'pos'): np.mean,\n",
"                 ('sentiment', 'neg'): np.mean}).unstack()\n",
"    return df.rename(columns={('sentiment', 'neg'): 'neg',\n",
"                              ('sentiment', 'pos'): 'pos'})"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 341
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# later, we need a conversion of (year, week) -> month.\n",
"# we can get it by querying the resulting dictionary\n",
"month_df = pd.DataFrame(df)\n",
"month_df['month'] = month_df.index.month\n",
"month_df = month_df.groupby([df.index.year, df.index.week]).agg({'month': lambda l: list(set(l))[0]})\n",
"month_dict = month_df.to_dict()['month']"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 242
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# convenience method for accessing months from (year, week):\n",
"def get_month(year, week):\n",
" return month_dict[(year, week)]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 243
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# convenience method for accessing sentiment\n",
"def sentiment_by_party(topic, party, sent):\n",
" return weekly_sentiment_by_party_for_topics(new_df, [topic], 5)[sent][party]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 198
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Count data\n",
"Functions, etc"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# given:\n",
"# - a topic\n",
"# - a party\n",
"# we want to get out:\n",
"#   a list of statement counts, normalized by the number of\n",
"#   representatives in that party during that time.\n",
"#   the length of this list should be equal to the number of\n",
"#   weeks in our corpus.\n",
"\n",
"def get_counts(t, party):\n",
"    \"\"\"dict of {time : count} for the given party, loaded from disk.\"\"\"\n",
"    path = \"../../../data/topics/{0}/{1}Weekly_topic.pkl\".format(t, party.lower())\n",
"    # context manager so the file handle is closed promptly\n",
"    # (the original leaked the handle from a bare open())\n",
"    with open(path, 'rb') as f:\n",
"        return pickle.load(f)\n",
"\n",
"def dem_normalize_list(vals):\n",
"    \"\"\"normalizing values for democratic statement counts.\n",
"    Constants are presumably senate + house seat totals for the three\n",
"    congressional periods covered -- TODO confirm the week boundaries\n",
"    (0-53, 54-157, 158+).\n",
"    \"\"\"\n",
"    return [57+257 for i in range(54)] + \\\n",
"           [51+193 for i in range(54,158)] + \\\n",
"           [53+199 for i in range(158, len(vals))]\n",
"\n",
"def rep_normalize_list(vals):\n",
"    \"\"\"normalizing values for republican statement counts (see above).\"\"\"\n",
"    return [41+178 for i in range(54)] + \\\n",
"           [47+242 for i in range(54,158)] + \\\n",
"           [45+234 for i in range(158, len(vals))]\n",
"\n",
"def divide_lists(l1, l2):\n",
"    \"\"\"returns a list of the elementwise ratio l1[i] / l2[i].\"\"\"\n",
"    return [float(x) / float(y) for x, y in zip(l1, l2)]\n",
"\n",
"def normalize_counts(vals, party):\n",
"    \"\"\"Divide raw counts by the party's per-period seat totals.\"\"\"\n",
"    if party.lower() == \"republican\":\n",
"        nlist = rep_normalize_list(vals)\n",
"    else:\n",
"        nlist = dem_normalize_list(vals)\n",
"    return divide_lists(vals, nlist)\n",
"\n",
"\n",
"def get_normalized_counts(t, party):\n",
"    \"\"\"a (year, week) indexed series of normalized statement counts.\n",
"    A statement is only counted if `t` is one of the statement's top 5 topics.\n",
"    Counts are normalized by the number of representatives in `party` when the statement\n",
"    was released.\n",
"    \"\"\"\n",
"    # get the counts, normalize them\n",
"    counts = get_counts(t, party)\n",
"    # NOTE(review): this relies on counts.values() being in chronological\n",
"    # order; a plain dict does not guarantee that -- verify the pickle\n",
"    # stores an OrderedDict (or sort the keys first).\n",
"    vals = normalize_counts(counts.values(), party)\n",
"\n",
"    # convert to series, convert the index to datetime\n",
"    series = pd.Series(dict(zip(counts.keys(), vals)))\n",
"    series.index = pd.to_datetime(series.index)\n",
"\n",
"    # change index to (year, week)\n",
"    # BUG FIX: the original referenced an undefined name `s` here,\n",
"    # raising NameError; the series is called `series`.\n",
"    return series.groupby([series.index.year, series.index.week]).sum()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 163
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Correlations"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# getting the opposite party. made difficult because oren had 'democrats' in the filenames.\n",
"opposite = {'democrat': 'republican',\n",
"            'democrats': 'republican',\n",
"            'republican': 'democrats'}\n",
"\n",
"\n",
"def create_regression_df(df):\n",
"    \"\"\"creates the regression dataframe using the data in df\n",
"    and other necessary controls (a categorical month column derived\n",
"    from the (year, week) index).\n",
"    \"\"\"\n",
"    # BUG FIX: the original read the global `reg_example_df` instead of the\n",
"    # `df` argument, so the month control came from an unrelated frame.\n",
"    months = [get_month(t[0], t[1]) for t in list(df.index)]\n",
"    df['month'] = months\n",
"    return df\n",
"\n",
"\n",
"def create_regression_formula(dep_var, reg_df):\n",
"    \"\"\"creates the regression formula for OLS given\n",
"    the dependent variable string and the dataframe that has\n",
"    columns for the dependent variable and all independent variables.\n",
"    \"\"\"\n",
"    # every column except the dependent variable and the month control\n",
"    # enters the formula as an independent variable\n",
"    cols = [c for c in reg_df.columns if c not in ['month', dep_var]]\n",
"    return \"{0} ~ {1} + C(month)\".format(dep_var, \" + \".join(cols))\n",
"\n",
"################################################\n",
"# equation for count/sentiment regression:\n",
"# count ~ lag_count + lag_sentiment + C(months)\n",
"# count/count regression:\n",
"# count ~ lag_count + lag_count_opp + C(months)\n",
"################################################\n",
"\n",
"\n",
"def do_count_regression(atopic, btopic, party, shift):\n",
"    \"\"\"predicting the count of documents in `atopic` for `party` by the\n",
"    party's own lagged count and the opposing party's count on `btopic`\n",
"    lagged by `shift` weeks.\n",
"    NOTE(review): requires `smf` (statsmodels.formula.api) to already be\n",
"    in the session namespace -- it is never imported in this notebook.\n",
"    \"\"\"\n",
"    opposite_party = opposite[party.lower()]\n",
"\n",
"    # get the normalized counts for this party and the opposite party\n",
"    counts = get_normalized_counts(atopic, party)\n",
"    counts_opp = get_normalized_counts(btopic, opposite_party)\n",
"\n",
"    # define the dependent variable\n",
"    dep_var = '{0}_counts'.format(party)\n",
"\n",
"    # create the dataframe with the correct column names based on party\n",
"    reg_df = pd.DataFrame({dep_var : counts,\n",
"                           '{0}_{1}_lag_counts'.format(party, atopic): counts.shift(),\n",
"                           '{0}_{1}_lag_{2}_counts'.format(opposite_party, btopic, shift): counts_opp.shift(shift)})\n",
"\n",
"    # add the categorical month control, build the formula, fit OLS\n",
"    reg_df = create_regression_df(reg_df)\n",
"    formula = create_regression_formula(dep_var, reg_df)\n",
"    est = smf.ols(formula=formula, data=reg_df).fit()\n",
"    return est\n",
"\n",
"\n",
"def do_sentiment_regression(atopic, btopic, party, sent, shift):\n",
"    \"\"\"predicting the count of documents in `atopic` for `party` by the\n",
"    party's own lagged count and the lagged sentiment of type `sent`\n",
"    ('pos' or 'neg') expressed by the opposing party on `btopic`.\n",
"    \"\"\"\n",
"    opposite_party = opposite[party.lower()]\n",
"\n",
"    # get the normalized counts for this party\n",
"    counts = get_normalized_counts(atopic, party)\n",
"\n",
"    # sentiment for opposite party on btopic\n",
"    sentiment_opp = sentiment_by_party(btopic, opposite_party.title(), sent)\n",
"\n",
"    # define dependent variable\n",
"    dep_var = '{0}_counts'.format(party)\n",
"\n",
"    # create the dataframe with the correct column names based on party\n",
"    reg_df = pd.DataFrame({dep_var : counts,\n",
"                           '{0}_{1}_lag_counts'.format(party, atopic): counts.shift(),\n",
"                           '{0}_{1}_{2}_lag_{3}_sent'.format(opposite_party, sent, btopic, shift): sentiment_opp.shift(shift)})\n",
"\n",
"    # create regression dataframe and formula, then fit\n",
"    reg_df = create_regression_df(reg_df)\n",
"    formula = create_regression_formula(dep_var, reg_df)\n",
"    est = smf.ols(formula=formula, data=reg_df).fit()\n",
"    return est"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 405
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"## Generate results, write to disk:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import itertools\n",
"import os\n",
"\n",
"# these are the clusters we're interested in\n",
"clusters = {'energy': [38, 47, 69, 71],\n",
"            'health': [30, 51, 80]}\n",
"\n",
"# get list of all topics, flattened, then get all topic pairs\n",
"all_topics = itertools.chain(*clusters.values())\n",
"topic_pairs = itertools.permutations(all_topics, 2)\n",
"\n",
"# all our shifts\n",
"shifts = [0, 1, 2]\n",
"\n",
"\n",
"# files will be saved as:\n",
"# ../../../data/results/correlations/{0}/{1}/{2}/{3}/{4}.html\n",
"# where:\n",
"# {0} : one of {Democrats_counts, Republican_counts}, depending on the value\n",
"#       being predicted\n",
"# {1} : the topic # whose counts are being predicted\n",
"# {2} : one of {sent, count}, depending on the predictor\n",
"# {3} : the lag amount used\n",
"# {4} : the predictor topic #\n",
"\n",
"\n",
"def write_to_path(content, filename):\n",
"    \"\"\"Write `content` to `filename`, creating parent directories as needed.\"\"\"\n",
"    if not os.path.exists(os.path.dirname(filename)):\n",
"        os.makedirs(os.path.dirname(filename))\n",
"    with open(filename, 'wb') as f:\n",
"        f.write(content)\n",
"\n",
"significant = []  # filenames of results whose predictor is significant\n",
"\n",
"data_dir = '../../../data/results/correlations/'\n",
"\n",
"# iterate over every (topic pair, lag, party, regression kind) combination\n",
"for topics in topic_pairs:\n",
"    for shift in shifts:\n",
"        for party in ['Democrats', 'Republican']:\n",
"            for kind in ['count', 'sent']:\n",
"                if kind == 'count':\n",
"                    # count ~ lag_count + lag_count_opp + C(months)\n",
"                    res = do_count_regression(topics[0], topics[1], party, shift)\n",
"                    filename = \"{0}_counts/{1}/{2}/{3}/{4}.txt\".format(party,\n",
"                                                                       topics[0],\n",
"                                                                       kind,\n",
"                                                                       shift,\n",
"                                                                       topics[1])\n",
"                    filename = os.path.join(data_dir, filename)\n",
"                    write_to_path(str(res.summary()), filename)\n",
"\n",
"                    # if the predictor is significant, add it to a list\n",
"                    if res.pvalues.tail(1).iloc[0] < .05:\n",
"                        significant.append(filename)\n",
"\n",
"                else:\n",
"                    # sentiment predictor: one regression per sentiment category\n",
"                    for sent_type in ['pos', 'neg']:\n",
"                        # BUG FIX: this branch previously ran\n",
"                        # do_count_regression and ignored sent_type, so the\n",
"                        # 'sent' result files contained count regressions.\n",
"                        res = do_sentiment_regression(topics[0], topics[1], party, sent_type, shift)\n",
"                        filename = \"{0}_counts/{1}/{2}/{3}/{4}/{5}.txt\".format(party,\n",
"                                                                               topics[0],\n",
"                                                                               kind,\n",
"                                                                               sent_type,\n",
"                                                                               shift,\n",
"                                                                               topics[1])\n",
"                        filename = os.path.join(data_dir, filename)\n",
"                        write_to_path(str(res.summary()), filename)\n",
"                        # if the predictor is significant, add it to a list\n",
"                        if res.pvalues.tail(1).iloc[0] < .05:\n",
"                            significant.append(filename)\n",
"\n",
"# write significant results to their own file, line by line:\n",
"with open(os.path.join(data_dir, 'significant.txt'), 'wb') as f:\n",
"    f.write('\\n'.join(significant))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 441
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"all_topics = itertools.chain(*clusters.values())\n",
"topic_pairs = itertools.permutations(all_topics, 2)\n",
"list(topic_pairs)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 440,
"text": [
"[(38, 47),\n",
" (38, 69),\n",
" (38, 71),\n",
" (38, 30),\n",
" (38, 51),\n",
" (38, 80),\n",
" (47, 38),\n",
" (47, 69),\n",
" (47, 71),\n",
" (47, 30),\n",
" (47, 51),\n",
" (47, 80),\n",
" (69, 38),\n",
" (69, 47),\n",
" (69, 71),\n",
" (69, 30),\n",
" (69, 51),\n",
" (69, 80),\n",
" (71, 38),\n",
" (71, 47),\n",
" (71, 69),\n",
" (71, 30),\n",
" (71, 51),\n",
" (71, 80),\n",
" (30, 38),\n",
" (30, 47),\n",
" (30, 69),\n",
" (30, 71),\n",
" (30, 51),\n",
" (30, 80),\n",
" (51, 38),\n",
" (51, 47),\n",
" (51, 69),\n",
" (51, 71),\n",
" (51, 30),\n",
" (51, 80),\n",
" (80, 38),\n",
" (80, 47),\n",
" (80, 69),\n",
" (80, 71),\n",
" (80, 30),\n",
" (80, 51)]"
]
}
],
"prompt_number": 440
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res = do_sentiment_regression(30, 30, 'Democrats', 'pos', 1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 349
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res = do_count_regression(30, 30, 'Democrats', 0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 391
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res.pvalues.tail(1).iloc[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 416,
"text": [
"0.40403599607963181"
]
}
],
"prompt_number": 416
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res.tvalues"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 369,
"text": [
"Intercept 1.821789e+00\n",
"C(month)[T.2] -2.537987e+00\n",
"C(month)[T.3] 1.709296e-01\n",
"C(month)[T.4] -3.261010e+00\n",
"C(month)[T.5] -4.321774e+00\n",
"C(month)[T.6] -4.500173e+00\n",
"C(month)[T.7] -4.196597e+00\n",
"C(month)[T.8] -2.763157e+00\n",
"C(month)[T.9] -1.952323e+00\n",
"C(month)[T.10] -5.307118e+00\n",
"C(month)[T.11] -6.063182e+00\n",
"C(month)[T.12] -5.222220e+00\n",
"Democrats_30_lag_counts 4.296383e+15\n",
"republican_30_lag_counts 3.273286e+00\n",
"dtype: float64"
]
}
],
"prompt_number": 369
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"res.params"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 393,
"text": [
"Intercept 0.009526\n",
"C(month)[T.2] 0.013784\n",
"C(month)[T.3] 0.027370\n",
"C(month)[T.4] 0.029769\n",
"C(month)[T.5] 0.025660\n",
"C(month)[T.6] 0.077222\n",
"C(month)[T.7] -0.016947\n",
"C(month)[T.8] 0.007844\n",
"C(month)[T.9] 0.011402\n",
"C(month)[T.10] -0.025175\n",
"C(month)[T.11] -0.010383\n",
"C(month)[T.12] 0.040636\n",
"Democrats_30_lag_counts 0.090149\n",
"republican_30_lag_0_counts 0.590797\n",
"dtype: float64"
]
}
],
"prompt_number": 393
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"HTML(res.summary().as_html())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<table class=\"simpletable\">\n",
"<caption>OLS Regression Results</caption>\n",
"<tr>\n",
" <th>Dep. Variable:</th> <td>Democrats_counts</td> <th> R-squared: </th> <td> 0.730</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Model:</th> <td>OLS</td> <th> Adj. R-squared: </th> <td> 0.712</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Method:</th> <td>Least Squares</td> <th> F-statistic: </th> <td> 40.38</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Date:</th> <td>Fri, 18 Jul 2014</td> <th> Prob (F-statistic):</th> <td>4.17e-48</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Time:</th> <td>17:30:32</td> <th> Log-Likelihood: </th> <td> 251.04</td>\n",
"</tr>\n",
"<tr>\n",
" <th>No. Observations:</th> <td> 208</td> <th> AIC: </th> <td> -474.1</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Df Residuals:</th> <td> 194</td> <th> BIC: </th> <td> -427.4</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Df Model:</th> <td> 13</td> <th> </th> <td> </td> \n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <td></td> <th>coef</th> <th>std err</th> <th>t</th> <th>P>|t|</th> <th>[95.0% Conf. Int.]</th> \n",
"</tr>\n",
"<tr>\n",
" <th>Intercept</th> <td> 0.0095</td> <td> 0.017</td> <td> 0.550</td> <td> 0.583</td> <td> -0.025 0.044</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.2]</th> <td> 0.0138</td> <td> 0.025</td> <td> 0.554</td> <td> 0.580</td> <td> -0.035 0.063</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.3]</th> <td> 0.0274</td> <td> 0.025</td> <td> 1.077</td> <td> 0.283</td> <td> -0.023 0.077</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.4]</th> <td> 0.0298</td> <td> 0.024</td> <td> 1.233</td> <td> 0.219</td> <td> -0.018 0.077</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.5]</th> <td> 0.0257</td> <td> 0.024</td> <td> 1.063</td> <td> 0.289</td> <td> -0.022 0.073</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.6]</th> <td> 0.0772</td> <td> 0.025</td> <td> 3.095</td> <td> 0.002</td> <td> 0.028 0.126</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.7]</th> <td> -0.0169</td> <td> 0.026</td> <td> -0.654</td> <td> 0.514</td> <td> -0.068 0.034</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.8]</th> <td> 0.0078</td> <td> 0.023</td> <td> 0.338</td> <td> 0.735</td> <td> -0.038 0.054</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.9]</th> <td> 0.0114</td> <td> 0.025</td> <td> 0.463</td> <td> 0.644</td> <td> -0.037 0.060</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.10]</th> <td> -0.0252</td> <td> 0.024</td> <td> -1.045</td> <td> 0.297</td> <td> -0.073 0.022</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.11]</th> <td> -0.0104</td> <td> 0.024</td> <td> -0.425</td> <td> 0.672</td> <td> -0.059 0.038</td>\n",
"</tr>\n",
"<tr>\n",
" <th>C(month)[T.12]</th> <td> 0.0406</td> <td> 0.026</td> <td> 1.566</td> <td> 0.119</td> <td> -0.011 0.092</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Democrats_30_lag_counts</th> <td> 0.0901</td> <td> 0.040</td> <td> 2.248</td> <td> 0.026</td> <td> 0.011 0.169</td>\n",
"</tr>\n",
"<tr>\n",
" <th>republican_30_lag_0_counts</th> <td> 0.5908</td> <td> 0.029</td> <td> 20.349</td> <td> 0.000</td> <td> 0.534 0.648</td>\n",
"</tr>\n",
"</table>\n",
"<table class=\"simpletable\">\n",
"<tr>\n",
" <th>Omnibus:</th> <td>80.910</td> <th> Durbin-Watson: </th> <td> 1.572</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Prob(Omnibus):</th> <td> 0.000</td> <th> Jarque-Bera (JB): </th> <td> 406.507</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Skew:</th> <td> 1.426</td> <th> Prob(JB): </th> <td>5.35e-89</td>\n",
"</tr>\n",
"<tr>\n",
" <th>Kurtosis:</th> <td> 9.226</td> <th> Cond. No. </th> <td> 12.2</td>\n",
"</tr>\n",
"</table>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 399,
"text": [
"<IPython.core.display.HTML at 0x47e5db10>"
]
}
],
"prompt_number": 399
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Example code: OLS regression of statement counts on lagged counts, lagged sentiment, and month dummies"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import statsmodels.formula.api as smf\n",
"from IPython.core.display import HTML\n",
"\n",
"def short_summary(est):\n",
"    \"\"\"Render only the coefficient table of a fitted statsmodels\n",
"    results object as an HTML table (tables[1] of the full summary).\n",
"    \"\"\"\n",
"    return HTML(est.summary().tables[1].as_html())\n",
"\n",
"def ols_regression(df):\n",
"    \"\"\"Fit an OLS of `count` on lagged count, lagged sentiment, and\n",
"    month dummies, and return the coefficient table as HTML.\n",
"\n",
"    C(months) expands the categorical month variable into dummy\n",
"    columns (main effects only, since the formula uses `+`; an\n",
"    interaction with the other regressors would require `*`).\n",
"\n",
"    NOTE(review): the formula references a column named `months`,\n",
"    but the fitted models shown above use `C(month)` -- confirm the\n",
"    actual column name in `df` before relying on this example.\n",
"    \"\"\"\n",
"    est = smf.ols(formula='count ~ lag_count + lag_sentiment + C(months)',\n",
"                  data=df).fit()\n",
"    return short_summary(est)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 260
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment