Code used for my "Automated Democracy Scores" paper.
#!/usr/bin/env python

import os
import pickle

import numpy as np
import pandas as pd

# set paths
basepath = '/fs/lustre/osu6994/hdf5/'
relfreq_rows = basepath + 'relfreq_rows/'
relfreq_cols = basepath + 'relfreq_cols/'
absfreq_cols = basepath + 'absfreq_cols/'
udsfile = basepath + 'uds.csv'
polityfile = basepath + 'polity.csv'
wordsfile = basepath + 'words'
batches = basepath + 'batches/'
output = basepath + 'output/'

# set reference years
refyears = [1992]

# create Ar: reference scores, one row per reference document
def create_Ar(reffile, refyears):
    if reffile == udsfile:
        full = pd.read_csv(udsfile, usecols = [0, 1, 3])
    elif reffile == polityfile:
        full = pd.read_csv(polityfile, usecols = [0, 1, 2])
    ref = full[full['year'] > 1991]  # cheap pre-filter; refyears does the real selection below
    # key: doc id = first column + year (yields names like 'USA1992'); value: score
    Ar = {ref.iat[row, 0] + str(ref.iat[row, 1]): ref.iat[row, 2]
          for row in range(len(ref)) if ref.iat[row, 1] in refyears}
    Ar = pd.DataFrame(list(Ar.items()), columns = ['doc', 'docscore'])
    Ar.set_index('doc', inplace = True)
    return Ar
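
# A minimal sketch of the shape create_Ar returns, with made-up names and
# scores (illustrative only, not taken from the actual data):
#
#     Ar = pd.DataFrame({'docscore': [0.83, -1.21]},
#                       index = pd.Index(['USA1992', 'CHN1992'], name = 'doc'))
#
# i.e. one row per reference document, holding its UDS or Polity score.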

# compute Sw (and save to file)
def compute_Sw(Ar, relfreq_rows):
    Sw = pd.DataFrame()
    for file in sorted(f for f in os.listdir(relfreq_rows) if f.endswith('.h5')):
        store = pd.HDFStore(relfreq_rows + file)
        Fwr = store['freq']
        # keep only the reference documents (columns indexed in Ar);
        # iterate over a copy so deleting columns is safe
        for col in list(Fwr.columns):
            if col != 'word' and col not in Ar.index:
                del Fwr[col]
        sumFwr = Fwr.sum(axis = 1)
        Pwr = Fwr.T / sumFwr
        Pwr = Pwr.T
        Pwr = Pwr.dropna()  # drop words that never occur in the reference texts
        Sw_r = Pwr.dot(Ar)
        Sw_r.columns = ['wordscore']
        Sw = pd.concat([Sw, Sw_r])
        store.close()
    Sw.to_csv(output + 'wordscores.csv', index = True, index_label = 'word')
    return Sw
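
# The loop above mirrors the Laver-Benoit-Garry "wordscores" step: each row
# of Pwr is a word's relative frequency distribution across the reference
# documents, and Sw = Pwr . Ar is that word's frequency-weighted average of
# the reference scores. A self-contained toy sketch (never called; names
# and numbers are illustrative only):
def _demo_wordscores():
    Fwr = pd.DataFrame({'USA1992': [2.0, 0.0], 'CHN1992': [1.0, 3.0]},
                       index = ['liberty', 'censorship'])
    Ar = pd.DataFrame({'docscore': [1.0, -1.0]}, index = ['USA1992', 'CHN1992'])
    Pwr = (Fwr.T / Fwr.sum(axis = 1)).T   # each row now sums to 1
    return Pwr.dot(Ar)                    # 'liberty' ~ 0.33, 'censorship' = -1.0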

# load column of words
def load_words(wordsfile):
    with open(wordsfile, mode = 'rb') as f:
        words = pickle.load(f)
    return words

# get frequencies for the virgin (to-be-scored) documents
def get_freq(path, file, Ar, words):
    store = pd.HDFStore(path + file)
    freq = store['freq']
    # drop every document from a reference year (doc names end in a 4-digit
    # year; refyears is the module-level list). Ar is unused here but kept
    # in the signature for symmetry with the other steps.
    for case in list(freq.columns):
        if int(case[-4:]) in refyears:
            del freq[case]
    freq['word'] = words
    freq.set_index('word', inplace = True)
    store.close()
    return freq

# compute Sv: score each virgin document with the wordscores
def compute_Sv(Fwv, Sw):
    tempjoin1 = pd.merge(Fwv, Sw, how = 'inner', left_index = True, right_index = True, sort = False)
    k = len(tempjoin1.columns) - 1  # all columns but the trailing 'wordscore'
    Sv = pd.DataFrame(tempjoin1.iloc[:, :k].T.dot(tempjoin1['wordscore']))
    Sv.columns = ['docscore']
    return tempjoin1, Sv
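
# Each virgin document's raw score is the frequency-weighted mean of its
# words' scores, Sv[v] = sum_w Fwv[w, v] * Sw[w]; the inner join above
# restricts the sum to words that actually received a wordscore.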

# compute Vv: within-document variance of the word scores
def compute_Vv(tempjoin1, Sv):
    cleanSw = pd.DataFrame(tempjoin1.wordscore)
    cleanSw.columns = ['score']
    cleanFwv = tempjoin1.drop('wordscore', axis = 1)
    Vv = (cleanFwv * np.square(np.array(cleanSw) - np.array(Sv.T))).sum(axis = 0)
    return Vv
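
# Vv[v] = sum_w Fwv[w, v] * (Sw[w] - Sv[v])^2: the frequency-weighted
# variance of the word scores within each virgin document, which feeds
# the standard errors computed below.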

# compute confidence intervals
def compute_CI(virgin_absfreq, Sw, Vv, Sv):
    tempjoin2 = pd.merge(virgin_absfreq, Sw, how = 'inner', left_index = True, right_index = True, sort = False)
    del tempjoin2['wordscore']
    N = tempjoin2.sum(axis = 0)  # scored word tokens per virgin document
    std_error = np.sqrt(Vv / N)
    lower = np.array(Sv).flatten() - np.array(1.96 * std_error)
    upper = np.array(Sv).flatten() + np.array(1.96 * std_error)
    return lower, upper
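
# The 95% interval is Sv +/- 1.96 * sqrt(Vv / N), where N counts each
# document's scored word tokens (hence the *absolute* frequencies here).
# A toy sanity check (numbers illustrative only):
#
#     Vv, N = 0.25, 400.0
#     1.96 * np.sqrt(Vv / N)   # -> 0.049, so the interval is Sv +/- 0.049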

# stack Sv and CIs and save batch to disk
def save_SvCI(Sv, lower, upper, counter):
    SvCI = Sv.copy()  # copy so the caller's Sv is not mutated
    SvCI['lower'] = lower
    SvCI['upper'] = upper
    SvCI.to_csv(batches + 'SvCI_batch_' + str(counter) + '.csv', header = False)

# compute SvCI batches and save to disk
def compute_SvCI(reffile, refyears, wordsfile, relfreq_rows, relfreq_cols, absfreq_cols):
    Ar = create_Ar(reffile, refyears)
    words = load_words(wordsfile)
    Sw = compute_Sw(Ar, relfreq_rows)
    # sort both listings: os.listdir order is arbitrary, and the relative-
    # and absolute-frequency files must pair up batch by batch
    relfiles = sorted(f for f in os.listdir(relfreq_cols) if f.endswith('.h5'))
    absfiles = sorted(f for f in os.listdir(absfreq_cols) if f.endswith('.h5'))
    counter = 0
    for relfile, absfile in zip(relfiles, absfiles):
        counter += 1
        Fwv = get_freq(relfreq_cols, relfile, Ar, words)
        tempjoin1, Sv = compute_Sv(Fwv, Sw)
        Vv = compute_Vv(tempjoin1, Sv)
        virgin_absfreq = get_freq(absfreq_cols, absfile, Ar, words)
        lower, upper = compute_CI(virgin_absfreq, Sw, Vv, Sv)
        save_SvCI(Sv, lower, upper, counter)

# consolidate all SvCI batches into one file
def consolidate_SvCI(output):
    with open(output + 'SvCI.csv', mode = 'w') as fullSv:
        fullSv.write(',Sv,lower,upper\n')
        for file in sorted(os.listdir(batches)):
            if 'SvCI_batch_' in file:
                with open(batches + file, mode = 'r') as batchfile:
                    fullSv.write(batchfile.read())

# compute transformed estimates
def compute_Svt(output, reffile, refyears):
    Ar = create_Ar(reffile, refyears)
    Sv = pd.read_csv(output + 'SvCI.csv', usecols = [0, 1], index_col = [0])
    lower = pd.read_csv(output + 'SvCI.csv', usecols = [0, 2], index_col = [0])
    upper = pd.read_csv(output + 'SvCI.csv', usecols = [0, 3], index_col = [0])
    mu = Sv['Sv'].mean()
    ratio = Ar['docscore'].std() / Sv['Sv'].std()
    Sv_t = (Sv - mu) * ratio + mu
    lower_t = (np.array(lower).flatten() - mu) * ratio + mu
    upper_t = (np.array(upper).flatten() - mu) * ratio + mu
    Sv_t['lower'] = lower_t
    Sv_t['upper'] = upper_t
    Sv_t.to_csv(output + 'Sv_t.csv')
    return Sv_t
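
# compute_Svt applies the usual wordscores rescaling: an affine map
#     Sv_t = (Sv - mean(Sv)) * (sd(Ar) / sd(Sv)) + mean(Sv)
# that stretches the virgin scores to the dispersion of the reference
# scores; the interval bounds go through the same map, so they keep their
# position relative to the point estimates.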

# run the full pipeline, scoring against the UDS reference scores
compute_SvCI(udsfile, refyears, wordsfile, relfreq_rows, relfreq_cols, absfreq_cols)
consolidate_SvCI(output)
compute_Svt(output, udsfile, refyears)