Last active
December 18, 2015 07:19
-
-
Save thiagomarzagao/5746292 to your computer and use it in GitHub Desktop.
The Python script below implements the ‘wordscores’ algorithm (see Laver, M., Benoit, K., Garry, J. Extracting policy positions from political texts using words as data. American Political Science Review, 97(2), 2003, pp. 311-331). It takes as inputs word-frequency matrices. These matrices must be in CSV format. The first column must contain the…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### WORDSCORES (LBG-2003)
### author: Thiago Marzagao
### contact: marzagao ddott 1 at osu ddott edu
import os

import numpy as np
import pandas as pd
ipath = '/Users/username/inputdata/'   # folder containing the CSV files
opath = '/Users/username/outputdata/'  # folder where output will be saved

# hardcode your reference cases and their scores
# (each key is a reference case name; the loader appends '.csv' to it to
# find the matching file in ipath)
Ar = pd.DataFrame({'referenceCase1': 1.1,  # these are just examples
                   'referenceCase2': 3.7,
                   'referenceCase3': 8.2},
                  index=['score'])
# create function to load and merge data
def loadData(caseSet, path, cols):
    '''
    iterable, string, dict -> pandas.DataFrame

    Read one headerless CSV file per case and outer-merge them on the word.

    caseSet : iterable of case names or file names; '.csv' is appended to
              any name that does not already end with it
    path    : folder containing the CSV files (with or without a trailing
              separator)
    cols    : dict mapping the positions of the two CSV columns to read
              (the word column and one value column) to their dtypes

    Returns a DataFrame with a 'word' column followed by one column per
    case, named after the case without its '.csv' suffix. A word that is
    absent from a case gets 0 in that case's column.
    '''
    suffix = '.csv'

    # Fixed-width string dtypes such as 'S30' are not accepted by the
    # pandas CSV parser in recent versions, so read those columns as str.
    dtypes = {}
    for col, spec in cols.items():
        try:
            kind = np.dtype(spec).kind
        except (TypeError, ValueError):
            kind = None  # not a NumPy dtype spec; pass it through untouched
        dtypes[col] = str if kind in ('S', 'U') else spec

    output = pd.DataFrame(columns=['word'])
    for case in caseSet:
        # accept either a bare case name or a file name
        caseName = case[:-len(suffix)] if case.endswith(suffix) else case

        # load new data file
        newData = pd.read_csv(os.path.join(path, caseName + suffix),
                              usecols=list(cols),
                              dtype=dtypes,
                              names=['word', caseName],
                              header=None)

        # merge with previous data
        output = pd.merge(output, newData, on='word', how='outer')

    # NOTE(review): read_csv's default NA parsing may turn words such as
    # 'NA' or 'null' into NaN, which the fillna below then replaces with 0;
    # confirm whether keep_default_na=False is wanted for the word column.
    return output.fillna(0)  # kill NaNs
# load reference data
# (the word column is read as str: the fixed-width 'S30' dtype is not
# accepted by the pandas CSV parser in recent versions; column 2 is
# presumably the relative word frequency -- confirm against the input files)
Fwr = loadData(Ar.keys(), ipath, {0: str, 2: 'float'})

# compute p(r|w) = f_wr / sum(f_wr)_{for all r}
# (sum the frequency columns only: 'word' holds strings and must stay out
# of the row totals)
refFreq = Fwr.iloc[:, 1:]
Pwr = refFreq.div(refFreq.sum(axis=1), axis=0)

# compute Sw and save to file
Sw = pd.DataFrame(Fwr.word)
Sw['score'] = Pwr.dot(Ar.T)['score']
Sw.to_csv(os.path.join(opath, 'wordscores.csv'), index=False)

# load virgin data: every CSV file in ipath that is not a reference case
# (other directory entries are skipped so they never reach read_csv;
# sorting makes the order of the output rows reproducible)
referenceCases = set(Ar.keys())
virginSet = [fname for fname in sorted(os.listdir(ipath))
             if fname.endswith('.csv')
             and fname[:-len('.csv')] not in referenceCases]
virginAbsFreq = loadData(virginSet, ipath, {0: str, 1: 'int'})
Fwv = loadData(virginSet, ipath, {0: str, 2: 'float'})

# 1:1 merge Fwv with Sw (to discard all disjoint words)
# NOTE(review): Fwv is not re-normalised after the unscored words are
# dropped, while N below counts scored words only -- confirm this is the
# intended estimator.
temp = pd.merge(Fwv, Sw, on='word', how='inner')

# split filtered Sw
cleanSw = pd.DataFrame(temp.score)

# clean up filtered Fwv
del temp['word']
del temp['score']
cleanFwv = temp

# compute Sv = sum(Fwv * Sw)_{for all w}
Sv = cleanFwv.T.dot(cleanSw)

# compute transformed Sv
# (mean and rescaling factor are taken now, before the confidence-interval
# columns are added to Sv further down; with a single virgin text the
# sample standard deviation, and hence the transformed score, is NaN)
SvMean = Sv.mean()
rescale = Ar.T.std() / Sv.std()
Sv_t = (Sv - SvMean) * rescale + SvMean

# compute Vv
# (cleanSw is one column and Sv.T one row, so the difference broadcasts to
# a words-by-texts matrix of squared deviations)
Vv = (cleanFwv * np.square(np.array(cleanSw) - np.array(Sv.T))).sum(axis=0)

# 1:1 merge absolute frequencies with Sw (to discard all disjoint words)
temp = pd.merge(virginAbsFreq, Sw, on='word', how='inner')

# compute N
del temp['word']
del temp['score']
N = temp.sum(axis=0)

# compute standard errors and confidence intervals
std_error = np.sqrt(Vv / N)
halfWidth = np.array(1.96 * std_error)
lower = np.array(Sv).flatten() - halfWidth
upper = np.array(Sv).flatten() + halfWidth

# compute transformed confidence intervals
lower_t = (np.array(lower) - np.array(SvMean)) \
    * np.array(rescale) \
    + np.array(SvMean)
upper_t = (np.array(upper) - np.array(SvMean)) \
    * np.array(rescale) \
    + np.array(SvMean)

# print everything
# (print is called with a single parenthesised argument so the script runs
# unchanged on both Python 2 and Python 3)
print('')
print('Original scores (w/ 95CI):')
print('')
Sv['lower'] = lower
Sv['upper'] = upper
print(Sv)
print('')
print('Transformed scores (w/ 95CI):')
print('')
Sv_t['lower'] = lower_t
Sv_t['upper'] = upper_t
print(Sv_t)
print('')

# save transformed estimates to file
Sv_t.to_csv(os.path.join(opath, 'virginScores.csv'), index_label='case')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment