from __future__ import division
import csv, logging, math, os.path
import pickle, random, re, string
import datetime, time
import numpy as np
import pandas as pd
import scipy as sp
## metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.utils.extmath import density
## vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
## CV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import ShuffleSplit
from sklearn.cross_validation import cross_val_score
## Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import nltk.data
from nltk.tokenize.regexp import WordPunctTokenizer
###########
##### Generating training data
###########
def generateTraining(df, filename):
    ## read the existing validation file (loaded but not used further here)
    val = pd.read_csv(filename)
    ## Classification
    ## grab a random sample of 500 tweets for each candidate
    idx = []
    idx += random.sample(df[df['obama'] == 1].index, 500)
    idx += random.sample(df[df['romney'] == 1].index, 500)
    ## randomize the index
    np.random.shuffle(idx)
    ## get the tweets
    sub = df.ix[idx]
    ## write to disk
    sub.to_csv("../data/" + debate + "-valence-validation.csv", index = True)
def repRT(row):
    if not pd.isnull(row['rt-text']):
        return row['rt-text']
    else:
        return row['text']
def determineShot(dt, lag = 0, vlag = 0):
    ## subtract an hour plus the lags
    dt = dt - datetime.timedelta(0, 3600 + vlag + lag)
    shot = tldf[(tldf['Start'] <= dt) & (tldf['End'] >= dt)]
    if len(shot) > 0:
        return shot.index.values[0]
    else:
        return None
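## Minimal usage sketch with assumed timestamps: a tweet created at 01:14:00
## GMT with vlag = 120 is shifted back 3720 seconds to 00:12:00 broadcast
## time and matched to whichever shot interval contains that moment.
# determineShot(datetime.datetime(2012, 10, 23, 1, 14, 0), vlag = 120)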
## debate number
debate = 'usprez3'
###############################################################################
##### reaction data
###############################################################################
## debate dates:
## 1 - 2012-10-04 (04 in GMT)
## VP - 2012-10-11 (12 in GMT)
## 2 - 2012-10-16 (17 in GMT)
## 3 - 2012-10-22 (23 in GMT)
date = ''
if debate == 'usprez1':
    sfile = "../data/Debate1-biobehavioral.csv"
    date = '04'
elif debate == 'usprez3':
    sfile = "../data/Debate3-biobehavioral.csv"
    date = '23'
shotdf = pd.read_csv(sfile, index_col = 2)
# Dataframe for matching up shots
tldf = pd.DataFrame({
    'Start': shotdf['Start-Stop'].apply(lambda x: datetime.datetime.strptime('2012-10-' + date + " " + x.split('-')[0], "%Y-%m-%d %H:%M:%S")),
    'End':   shotdf['Start-Stop'].apply(lambda x: datetime.datetime.strptime('2012-10-' + date + " " + x.split('-')[1], "%Y-%m-%d %H:%M:%S"))
})
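## Hedged sanity check, assuming 'Start-Stop' holds strings such as
## "00:09:12-00:11:10" (the format implied by the split on '-'):
# print(tldf.head())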
###############################################################################
##### sentiment classifier
###############################################################################
def benchmark(clf, X, y, feature_names = []):
    pred = clf.predict(X)
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if len(feature_names) > 0:
            print("top 20 keywords per class:")
            nCat  = len(categories)
            nCoef = clf.coef_.shape[0]
            if nCat > 2:
                if nCoef == nCat:
                    for i, category in enumerate(categories):
                        top20 = np.argsort(clf.coef_[i])[-20:]
                        print("%s: %s" % (category, "; ".join(map(lambda x: x.encode("utf-8"), feature_names[top20]))))
            else:
                ## binary case: one coefficient vector, read off the positive class
                category = categories[1]
                top20 = np.argsort(clf.coef_[0])[-20:]
                print("%s: %s" % (category, "; ".join(map(lambda x: x.encode("utf-8"), feature_names[top20]))))
    print("classification report:")
    if len(feature_names) > 0:
        print(metrics.classification_report(y, pred, labels = labels, target_names = categories))
    else:
        print(metrics.classification_report(y, pred))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y, pred))
    return pred
tdf = pd.read_csv('../data/' + debate + '-valence-validation-coderall.csv')
tdf['y'] = tdf.sentiment.apply(lambda x: int(x))
tdf = tdf[tdf.y != 0]
## replace words associated with a particular candidate with a generic token
tdf.text = tdf.text.apply(lambda x: re.sub(r"obama|romney(ryan)?", "lname", x, flags = re.I))
tdf.text = tdf.text.apply(lambda x: re.sub(r"mitt|barack", "fname", x, flags = re.I))
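## Illustrative check of the substitutions on a made-up tweet; both patterns
## are case-insensitive, so e.g.:
# re.sub(r"obama|romney(ryan)?", "lname", "Mitt Romney vs Barack Obama", flags = re.I)
# ## -> 'Mitt lname vs Barack lname', after which the fname pass yields
# ## -> 'fname lname vs fname lname'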
## random 80/20 train/test split
train_idx = random.sample(tdf.index, int(tdf.shape[0]*0.8))
train_df = tdf.ix[train_idx]
## reset index for CV
train_df = train_df.reset_index()
cv = StratifiedKFold(train_df.y, n_folds = 3)
## save for last test
final_df = tdf.drop(train_idx)
labels = [-1, 1]
categories = ['negative', 'positive']
results = {'f1':[], 'p': [], 'r': [], 'c': [], 'cn': [], 'pen': []}
for train, test in cv:
    vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, ngram_range = (1, 2), stop_words = 'english')
    X_train = vectorizer.fit_transform(list(train_df.ix[train].text))
    X_test  = vectorizer.transform(list(train_df.ix[test].text))
    features = np.asarray(vectorizer.get_feature_names())
    y_train = train_df.ix[train].y
    y_test  = train_df.ix[test].y
    clf = LogisticRegression(class_weight = "auto", C = 0.01)
    clf.fit(X_train, y_train)
    pred = benchmark(clf, X_test, y_test, features)
## best for this dataset: refit C = 10**-2 on the full training set
## before scoring the held-out final_df
vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, ngram_range = (1, 2), stop_words = 'english')
X_train = vectorizer.fit_transform(list(train_df.text))
features = np.asarray(vectorizer.get_feature_names())
clf = LogisticRegression(class_weight = "auto", C = 10**-2)
clf.fit(X_train, train_df.y)
X_final = vectorizer.transform(list(final_df.text))
pred = benchmark(clf, X_final, final_df.y, features)
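## Optional sketch: pickle is imported above but never used; persisting the
## fitted vectorizer and classifier would let later scoring runs skip
## retraining (the output path is an assumption):
# with open("../data/" + debate + "-sentiment-clf.pkl", "wb") as f:
#     pickle.dump((vectorizer, clf), f)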
###############################################################################
##### gardenhose
###############################################################################
## all debates ran 9-10:30 PM EST
## http://www.huffingtonpost.com/2012/10/02/presidential-debate-schedule-2012_n_1931082.html
if debate == 'usprez1':
    gh_cols = ["id_str", "created_at", "text", "user-id_str", "user-name", "user-screen_name", "user-userlevel",
               "rt-id_str", "rt-created_at", "rt-text", "rt-user-id_str", "rt-user-name", "rt-user-screen_name", "rt-user-userlevel"]
    df = pd.read_csv("/project/hanna/elex2012/debates/gh.20121003-usprez.csv",
                     sep = "\t", quoting = csv.QUOTE_NONE, index_col = False, names = gh_cols, parse_dates = [1])
    startDt = datetime.datetime(2012, 10, 4, 1, 0, 0)
    endDt   = datetime.datetime(2012, 10, 4, 2, 35, 0)
    lag = 90
elif debate == 'usprez3':
    gh_cols = ["id_str", "created_at", "text", "user-id_str", "user-name", "user-screen_name",
               "rt-id_str", "rt-created_at", "rt-text", "rt-user-id_str", "rt-user-name", "rt-user-screen_name"]
    df = pd.read_csv("/project/hanna/elex2012/debates/gh.20121022-usprez3.csv",
                     sep = "\t", quoting = csv.QUOTE_NONE, index_col = False, names = gh_cols, parse_dates = [1],
                     na_values = [r"\N"], keep_default_na = True, error_bad_lines = False)
    startDt = datetime.datetime(2012, 10, 23, 1, 0, 0)
    endDt   = datetime.datetime(2012, 10, 23, 2, 35, 0)
    lag = 120
## filter to debate, sort
df = df.loc[(df['created_at'] >= startDt) & (df['created_at'] <= endDt)]
df = df.sort('created_at')
## copy retweet text into the main text field for convenience
df['text'] = df.apply(repRT, axis = 1)
## lowercase
df['text'] = df['text'].str.lower()
############
##### Memes
############
## Debate 3 memes
# O: "The 1980's are now calling to ask for their foreign policy back...":
# Start: 00:09:12
# FT: 00:11:10
#df['1980'] = df['text'].apply(lambda x: 1 if '1980' in x else 0)
# R: "Attacking me is not an agenda..."
# Start: 00:11:15
# FT: 01:12:53
#df['attack'] = df['text'].apply(lambda x: 1 if 'attacking me' in x else 0)
# O: "Well, governor, we also have fewer horses and bayonets, because the nature of our military's changed..."
# Start: 00:42:19
# FT: 00:44:04
#df['hnb'] = df['text'].apply(lambda x: 1 if 'horses and bayonets' in x else 0)
# Romney's "I love teachers...":
# Start: 01:26:24
# FT: 01:29:00
#df['teach'] = df['text'].apply(lambda x: 1 if 'i love teachers' in x else 0)
## I'm thinking a two minute lag will be about right for debate 3
###############
##### end memes
###############
## Index tweets that mention only Obama or Romney
df['obama'] = df['text'].apply(lambda x: 1 if 'obama' in x and 'romney' not in x else 0)
df['romney'] = df['text'].apply(lambda x: 1 if 'obama' not in x and 'romney' in x else 0)
## then replace candidate mentions with lname/fname
## NOTE: only do this in the classifying step
df['text'] = df['text'].apply(lambda x: re.sub(r"obama|romney(ryan)?", "lname", x, flags = re.I))
df['text'] = df['text'].apply(lambda x: re.sub(r"mitt|barack", "fname", x, flags = re.I))
## vectorize text and produce sentiment vector
df['score'] = clf.predict(vectorizer.transform(df['text']))
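## Quick hedged spot check: the predicted scores should take values in
## {-1, 1}, matching the labels defined above.
# print(df['score'].value_counts())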
## bin each tweet by shot
df['Shot'] = df['created_at'].apply(determineShot, args = [0, lag])
#df['Shot15'] = df['created_at'].apply(determineShot, args = [15, lag])
#df['Shot30'] = df['created_at'].apply(determineShot, args = [30, lag])
#df['Shot45'] = df['created_at'].apply(determineShot, args = [45, lag])
## assign candidate scores
df['O_score'] = df.apply(lambda x: x['score'] if x['obama'] else None, axis = 1)
df['R_score'] = df.apply(lambda x: x['score'] if x['romney'] else None, axis = 1)
############
## Volume and sentiment. No need to generate these for the
## regression analysis dataset
############
##### volume by minute
df['date'] = np.array(df['created_at'], dtype="datetime64[m]")
grouped = df.groupby('date')
ovol = grouped['obama'].agg([np.sum])
rvol = grouped['romney'].agg([np.sum])
ovol['person'] = 'Obama'
rvol['person'] = 'Romney'
out = ovol.append(rvol)
out.to_csv("../data/gh.%s-volume.csv" % debate)
##### sentiment by minute
osent = grouped['O_score'].agg([np.mean, np.std])
rsent = grouped['R_score'].agg([np.mean, np.std])
osent['person'] = 'Obama'
rsent['person'] = 'Romney'
out = osent.append(rsent)
out.to_csv("../data/gh.%s-sentiment.csv" % debate)
## Group by different shots with lags and volume
for s in ['Shot']: #, 'Shot15', 'Shot30', 'Shot45']:
    grouped = df.groupby(s)
    oscore = grouped['O_score'].agg([np.mean])
    oscore.columns = ['GH_Osentiment_' + s]
    ovol = grouped['obama'].agg([np.sum])
    ovol.columns = ['GH_Ovolume_' + s]
    rscore = grouped['R_score'].agg([np.mean])
    rscore.columns = ['GH_Rsentiment_' + s]
    rvol = grouped['romney'].agg([np.sum])
    rvol.columns = ['GH_Rvolume_' + s]
    ## join them all together
    shotdf = shotdf.merge(oscore, left_index = True, right_index = True)
    shotdf = shotdf.merge(ovol, left_index = True, right_index = True)
    shotdf = shotdf.merge(rscore, left_index = True, right_index = True)
    shotdf = shotdf.merge(rvol, left_index = True, right_index = True)
shotdf.to_csv("../data/" + debate + "-biobehavioral-twitterstats.csv")