from __future__ import division
import csv, logging, math, os.path
import pickle, random, re, string
import datetime, time
import numpy as np
import pandas as pd
import scipy as sp
## metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.utils.extmath import density
## vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer
## CV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import ShuffleSplit
from sklearn.cross_validation import cross_val_score
## Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import nltk.data
from nltk.tokenize.regexp import WordPunctTokenizer
###########
##### Generating training data
###########
def generateTraining(df, filename):
    ## Classification
    ## grab a random sample of 500 tweets for each candidate
    idx = []
    idx += random.sample(df[df['obama'] == 1].index, 500)
    idx += random.sample(df[df['romney'] == 1].index, 500)
    ## randomize the index
    np.random.shuffle(idx)
    ## get the tweets
    sub = df.ix[idx]
    ## write to disk
    sub.to_csv(filename, index = True)
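## example (hypothetical call; run once to create the file that is hand-coded
## and read back in below):
# generateTraining(df, "../data/" + debate + "-valence-validation.csv")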
## prefer the retweeted status text over the tweet's own text when present
def repRT(row):
    if not pd.isnull(row['rt-text']):
        return row['rt-text']
    else:
        return row['text']
def determineShot(dt, lag = 0, vlag = 0):
    ## shift back one hour plus the lags (all in seconds)
    dt = dt - datetime.timedelta(0, 3600 + vlag + lag)
    shot = tldf[(tldf['Start'] <= dt) & (tldf['End'] >= dt)]
    if len(shot) > 0:
        return shot.index.values[0]
    else:
        return None
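## e.g. with vlag = 120: a tweet created at 2012-10-23 01:15:00 GMT (hypothetical
## timestamp) is shifted back 1h02m to 00:13:00 and matched to the shot whose
## [Start, End] interval contains it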
## debate number
debate = 'usprez3'
###############################################################################
##### reaction data
###############################################################################
## debate dates:
## 1 - 2012-10-04 (04 in GMT)
## VP - 2012-10-11 (12 in GMT)
## 2 - 2012-10-16 (17 in GMT)
## 3 - 2012-10-22 (23 in GMT)
date = ''
if debate == 'usprez1':
    sfile = "../data/Debate1-biobehavioral.csv"
    date = '04'
elif debate == 'usprez3':
    sfile = "../data/Debate3-biobehavioral.csv"
    date = '23'
shotdf = pd.read_csv(sfile, index_col = 2)
# Dataframe for matching up shots
tldf = pd.DataFrame({
    'Start': shotdf['Start-Stop'].apply(lambda x: datetime.datetime.strptime('2012-10-' + date + " " + x.split('-')[0], "%Y-%m-%d %H:%M:%S")),
    'End': shotdf['Start-Stop'].apply(lambda x: datetime.datetime.strptime('2012-10-' + date + " " + x.split('-')[1], "%Y-%m-%d %H:%M:%S"))
})
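## e.g. a (hypothetical) Start-Stop value of "01:09:12-01:11:10" becomes
## Start = 2012-10-23 01:09:12 and End = 2012-10-23 01:11:10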
###############################################################################
##### sentiment classifier
###############################################################################
## NOTE: categories and labels are module-level globals defined below
def benchmark(clf, X, y, feature_names = []):
    pred = clf.predict(X)
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if len(feature_names) > 0:
            print("top 20 keywords per class:")
            nCat = len(categories)
            nCoef = clf.coef_.shape[0]
            if nCat > 2:
                if nCoef == nCat:
                    for i, category in enumerate(categories):
                        top20 = np.argsort(clf.coef_[i])[-20:]
                        print("%s: %s" % (category, "; ".join(map(lambda x: x.encode("utf-8"), feature_names[top20]))))
            else:
                ## binary case: one coefficient row, oriented toward the positive class
                category = categories[1]
                top20 = np.argsort(clf.coef_[0])[-20:]
                print("%s: %s" % (category, "; ".join(map(lambda x: x.encode("utf-8"), feature_names[top20]))))
    print("classification report:")
    if len(feature_names) > 0:
        print(metrics.classification_report(y, pred, labels = labels, target_names = categories))
    else:
        print(metrics.classification_report(y, pred))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y, pred))
    return pred
tdf = pd.read_csv('../data/' + debate + '-valence-validation-coderall.csv')
tdf['y'] = tdf.sentiment.apply(lambda x: int(x))
tdf = tdf[tdf.y != 0]
## replace words associated with a particular candidate with a generic token
tdf.text = tdf.text.apply(lambda x: re.sub(r"obama|romney(ryan)?", "lname", x, flags = re.I))
tdf.text = tdf.text.apply(lambda x: re.sub(r"mitt|barack", "fname", x, flags = re.I))
## random 80% for training; the rest is held out for a final test below
train_idx = random.sample(tdf.index, int(tdf.shape[0] * 0.8))
train_df = tdf.ix[train_idx]
## reset index for CV
train_df = train_df.reset_index()
cv = StratifiedKFold(train_df.y, n_folds = 3)
## save for last test
final_df = tdf.drop(train_idx)
labels = [-1, 1]
categories = ['negative', 'positive']
results = {'f1': [], 'p': [], 'r': [], 'c': [], 'cn': [], 'pen': []}
for train, test in cv:
    vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, ngram_range = (1, 2), stop_words = 'english')
    X_train = vectorizer.fit_transform(list(train_df.ix[train].text))
    X_test = vectorizer.transform(list(train_df.ix[test].text))
    features = np.asarray(vectorizer.get_feature_names())
    y_train = train_df.ix[train].y
    y_test = train_df.ix[test].y
    clf = LogisticRegression(class_weight = "auto", C = 0.01)
    clf.fit(X_train, y_train)
    pred = benchmark(clf, X_test, y_test, features)
## best for this dataset; refit on the full training set before scoring the held-out set
vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, ngram_range = (1, 2), stop_words = 'english')
X_train = vectorizer.fit_transform(list(train_df.text))
features = np.asarray(vectorizer.get_feature_names())
clf = LogisticRegression(class_weight = "auto", C = 10**-2)
clf.fit(X_train, train_df.y)
X_final = vectorizer.transform(list(final_df.text))
pred = benchmark(clf, X_final, final_df.y, features)
###############################################################################
##### gardenhose
###############################################################################
## all debates ran 9-10:30 PM EST
## http://www.huffingtonpost.com/2012/10/02/presidential-debate-schedule-2012_n_1931082.html
if debate == 'usprez1':
    gh_cols = ["id_str", "created_at", "text", "user-id_str", "user-name", "user-screen_name", "user-userlevel",
               "rt-id_str", "rt-created_at", "rt-text", "rt-user-id_str", "rt-user-name", "rt-user-screen_name", "rt-user-userlevel"]
    df = pd.read_csv("/project/hanna/elex2012/debates/gh.20121003-usprez.csv",
                     sep = "\t", quoting = csv.QUOTE_NONE, index_col = False, names = gh_cols, parse_dates = [1])
    startDt = datetime.datetime(2012, 10, 4, 1, 0, 0)
    endDt = datetime.datetime(2012, 10, 4, 2, 35, 0)
    lag = 90
elif debate == 'usprez3':
    gh_cols = ["id_str", "created_at", "text", "user-id_str", "user-name", "user-screen_name",
               "rt-id_str", "rt-created_at", "rt-text", "rt-user-id_str", "rt-user-name", "rt-user-screen_name"]
    df = pd.read_csv("/project/hanna/elex2012/debates/gh.20121022-usprez3.csv",
                     sep = "\t", quoting = csv.QUOTE_NONE, index_col = False, names = gh_cols, parse_dates = [1],
                     na_values = [r"\N"], keep_default_na = True, error_bad_lines = False)
    startDt = datetime.datetime(2012, 10, 23, 1, 0, 0)
    endDt = datetime.datetime(2012, 10, 23, 2, 35, 0)
    lag = 120
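## lag is in seconds (see determineShot); presumably the rough delay between an
## on-screen moment and the bulk of the Twitter reaction to it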
## filter to the debate window, sort by time
df = df.loc[(df['created_at'] >= startDt) & (df['created_at'] <= endDt)]
df = df.sort('created_at')
## for retweets, analyze the retweeted status text rather than the "RT @..." wrapper
df['text'] = df.apply(repRT, axis = 1)
## lowercase
df['text'] = df['text'].apply(str.lower)
############
##### Memes
############
## Debate 3 memes
# O: "The 1980s are now calling to ask for their foreign policy back...":
# Start: 00:09:12
# FT: 00:11:10
#df['1980'] = df['text'].apply(lambda x: 1 if '1980' in x else 0)
# R: "Attacking me is not an agenda..."
# Start: 00:11:15
# FT: 01:12:53
#df['attack'] = df['text'].apply(lambda x: 1 if 'attacking me' in x else 0)
# O: "Well, governor, we also have fewer horses and bayonets, because the nature of our military's changed..."
# Start: 00:42:19
# FT: 00:44:04
#df['hnb'] = df['text'].apply(lambda x: 1 if 'horses and bayonets' in x else 0)
# R: "I love teachers...":
# Start: 01:26:24
# FT: 01:29:00
#df['teach'] = df['text'].apply(lambda x: 1 if 'i love teachers' in x else 0)
## I'm thinking a two-minute lag will be about right for debate 3
###############
##### end memes
###############
## Index tweets that mention only Obama or Romney
df['obama'] = df['text'].apply(lambda x: 1 if 'obama' in x and 'romney' not in x else 0)
df['romney'] = df['text'].apply(lambda x: 1 if 'obama' not in x and 'romney' in x else 0)
## then replace candidate mentions with lname/fname
## NOTE: only do this in the classifying step
df['text'] = df['text'].apply(lambda x: re.sub(r"obama|romney(ryan)?", "lname", x, flags = re.I))
df['text'] = df['text'].apply(lambda x: re.sub(r"mitt|barack", "fname", x, flags = re.I))
## vectorize text and produce sentiment vector
df['score'] = clf.predict(vectorizer.transform(df['text']))
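## quick sanity check (hypothetical tweet, masked as above):
# clf.predict(vectorizer.transform(["lname is winning this debate"]))  # -> array([1]) or array([-1])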
## bin each tweet by shot
df['Shot'] = df['created_at'].apply(determineShot, args = [0, lag])
#df['Shot15'] = df['created_at'].apply(determineShot, args = [15, lag])
#df['Shot30'] = df['created_at'].apply(determineShot, args = [30, lag])
#df['Shot45'] = df['created_at'].apply(determineShot, args = [45, lag])
## assign candidate scores
df['O_score'] = df.apply(lambda x: x['score'] if x['obama'] else None, axis = 1)
df['R_score'] = df.apply(lambda x: x['score'] if x['romney'] else None, axis = 1)
############
## Volume and sentiment. No need to generate these for the
## regression analysis dataset
############
##### volume by minute
df['date'] = np.array(df['created_at'], dtype = "datetime64[m]")
grouped = df.groupby('date')
ovol = grouped['obama'].agg([np.sum])
rvol = grouped['romney'].agg([np.sum])
ovol['person'] = 'Obama'
rvol['person'] = 'Romney'
out = ovol.append(rvol)
out.to_csv("../data/gh.%s-volume.csv" % debate)
##### sentiment by minute
osent = grouped['O_score'].agg([np.mean, np.std])
rsent = grouped['R_score'].agg([np.mean, np.std])
osent['person'] = 'Obama'
rsent['person'] = 'Romney'
out = osent.append(rsent)
out.to_csv("../data/gh.%s-sentiment.csv" % debate)
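## each output has one row per (minute, person): volume as a sum column,
## sentiment as mean/std columns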
## Group by different shots with lags and volume
for s in ['Shot']:  #, 'Shot15', 'Shot30', 'Shot45']:
    grouped = df.groupby(s)
    oscore = grouped['O_score'].agg([np.mean])
    oscore.columns = ['GH_Osentiment_' + s]
    ovol = grouped['obama'].agg([np.sum])
    ovol.columns = ['GH_Ovolume_' + s]
    rscore = grouped['R_score'].agg([np.mean])
    rscore.columns = ['GH_Rsentiment_' + s]
    rvol = grouped['romney'].agg([np.sum])
    rvol.columns = ['GH_Rvolume_' + s]
    ## join them all together
    shotdf = shotdf.merge(oscore, left_index = True, right_index = True)
    shotdf = shotdf.merge(ovol, left_index = True, right_index = True)
    shotdf = shotdf.merge(rscore, left_index = True, right_index = True)
    shotdf = shotdf.merge(rvol, left_index = True, right_index = True)
shotdf.to_csv("../data/" + debate + "-biobehavioral-twitterstats.csv")
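## the resulting CSV pairs each shot's biobehavioral measures with the
## per-shot Twitter sentiment and volume computed above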