Created
May 19, 2015 13:20
-
-
Save interrogator/cf42d9e3faf44a2be55b to your computer and use it in GitHub Desktop.
eugener-code-and-output
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# eugener script: count the words found at each distance left and right of a risk word
def eugener(path='data/nyt/earlylate',
            regex=r'(?i)\brisk',
            depth=5,
            top=10,
            remove_stopwords=False):
    """Tabulate the most frequent words to the left and right of a search term.

    Every sub-directory of *path* is treated as one subcorpus.  Each one is
    searched with ``./tregex.sh``; the matching lines are lowercased and
    tokenised, and the tokens found 1..depth positions either side of every
    token matching *regex* are counted.

    Parameters
    ----------
    path : str
        Directory whose immediate sub-directories are the subcorpora.
    regex : str
        Regular expression identifying the search term.  It is handed to
        tregex (wrapped in slashes) and also applied to individual tokens.
    depth : int
        How many token positions to inspect on each side of a match.
    top : int
        Number of words (ranked by total count) kept per subcorpus.
    remove_stopwords : bool
        When true, words listed in ``dictionaries.stopwords`` are not counted.
        That module is only imported when this flag is set.

    Returns
    -------
    dict
        Maps each sub-directory name to a pandas DataFrame with one row per
        word and one column per offset from ``-depth`` to ``depth`` (0 being
        the matching token itself) followed by a ``'Total'`` column.  An empty
        dict is returned when *path* has no sub-directories.
    """
    import os
    import re
    from collections import Counter

    if remove_stopwords:
        from dictionaries.stopwords import stopwords
        # set membership is O(1); the check runs once per distinct word
        ignored = frozenset(stopwords)
    else:
        ignored = frozenset()

    # get list of subcorpora
    dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    pattern = re.compile(regex)
    offsets = list(range(-depth, depth + 1))

    # place for our output
    dfs = {}
    for corpus in dirs:
        print('Doing %s ... ' % corpus)
        tokens = _matching_tokens(regex, os.path.join(path, corpus))
        # locate the search-term tokens once instead of once per offset
        hits = [pos for pos, token in enumerate(tokens) if pattern.search(token)]

        counters = []
        vocabulary = set()
        for offset in offsets:
            print('Depth: %d...' % offset)
            counted = _count_at_offset(tokens, hits, offset)
            # Every collocate seen becomes a row candidate, even if it is
            # filtered out below; such words simply end up with zero counts.
            vocabulary.update(counted)
            # remove punctuation etc, and stopwords when requested
            counters.append(Counter({
                word: freq for word, freq in counted.items()
                if word.isalnum() and word not in ignored
            }))

        print('Making DataFrame ... ')
        dfs[corpus] = _top_collocates_frame(counters, vocabulary, offsets, top)
    return dfs


def _matching_tokens(regex, corpus_path):
    """Run tregex over *corpus_path* and return the lowercased tokens of its matching lines."""
    import re
    import subprocess

    import nltk

    # An argument list (no shell) delivers the regex and the path verbatim,
    # whatever spaces or shell metacharacters they contain.  The exit status
    # is deliberately not checked: stderr is merged into stdout and anything
    # that is not a matching sentence is dropped by the filter below.
    process = subprocess.Popen(
        ['./tregex.sh', '-t', '-w', '/%s/' % regex, corpus_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True)
    output = process.communicate()[0]
    # remove blank lines etc, then lowercase what is left
    lines = [line.lower() for line in output.splitlines()
             if re.search(regex, line)]
    return nltk.word_tokenize('\n'.join(lines))


def _count_at_offset(tokens, hits, offset):
    """Count the tokens sitting *offset* positions away from each index in *hits*."""
    from collections import Counter

    total = len(tokens)
    targets = (pos + offset for pos in hits)
    # Skip positions that fall off either end of the token list.  A negative
    # index must not be used: it would wrap round to the end of the list.
    return Counter(tokens[t] for t in targets if 0 <= t < total)


def _top_collocates_frame(counters, vocabulary, offsets, top):
    """Build the word-by-offset table, ranked by total count and cut to *top* rows."""
    import pandas as pd

    labels = list(offsets) + ['Total']
    columns = {}
    for word in vocabulary:
        per_offset = [counter[word] for counter in counters]
        columns[word] = per_offset + [sum(per_offset)]
    df = pd.DataFrame(columns, index=labels)
    # sort by total, largest first, using positional indexing throughout
    order = df.loc['Total'].values.argsort()[::-1]
    # just top entries, transposed so that each word is a row
    return df.iloc[:, order[:top]].T
import pandas as pd

# Collect the 25 most frequent non-stopword collocates, looking up to seven
# tokens either side of each risk word, for every subcorpus.
data = eugener(depth=7, top=25, remove_stopwords=True)

# Raise the display limits so the wide offset table is not truncated.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# print(...) with a single argument behaves the same on Python 2 and 3
print(data['1995'])
# -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 Total | |
# risk 73 59 39 30 21 6 0 4749 0 11 23 26 33 61 72 5203 | |
# risks 32 22 11 13 6 3 0 1611 0 0 5 12 15 25 30 1785 | |
# risky 12 8 8 5 4 0 0 684 0 0 1 8 5 9 7 751 | |
# high 8 8 7 6 4 7 127 0 2 4 8 6 7 5 8 207 | |
# people 16 17 13 22 24 16 1 0 5 10 24 7 20 13 14 202 | |
# cancer 4 4 3 1 3 0 47 0 0 26 66 16 9 13 5 197 | |
# health 6 6 5 10 9 1 91 0 4 14 14 6 11 7 10 194 | |
# heart 3 3 2 1 2 17 6 0 1 76 38 14 5 5 9 182 | |
# disease 10 2 4 1 4 0 16 0 0 8 54 42 18 8 6 173 | |
# risking 1 1 1 0 0 0 0 157 0 0 0 1 2 1 2 166 | |
# women 6 12 17 22 14 7 0 0 8 5 11 21 20 13 7 163 | |
# risked 0 0 0 0 0 0 0 141 0 0 0 3 0 3 3 150 | |
# political 6 6 1 8 8 0 65 0 7 10 6 6 5 5 11 144 | |
# reduce 3 2 1 3 7 93 22 0 0 0 3 0 1 2 2 139 | |
# percent 17 7 13 11 12 15 6 0 0 0 9 6 10 17 12 135 | |
# business 8 3 9 10 10 0 5 0 52 2 11 6 6 8 4 134 | |
# factors 5 1 3 6 2 0 0 0 110 1 0 3 0 2 1 134 | |
# losing 2 1 3 2 0 0 0 0 84 26 1 1 2 3 3 128 | |
# increased 2 2 2 3 1 18 84 0 0 6 1 3 2 1 2 127 | |
# investors 11 14 14 13 19 1 2 0 4 9 12 8 6 5 6 124 | |
# greater 4 6 5 4 6 6 63 0 2 3 2 7 7 5 3 123 | |
# death 1 3 2 2 1 0 6 0 7 29 29 16 7 8 2 113 | |
# financial 2 7 6 6 2 1 33 0 5 10 6 9 7 6 7 107 | |
# american 18 8 9 5 11 2 1 0 7 11 8 8 7 6 4 105 | |
# lives 1 6 6 9 2 7 0 0 3 54 8 5 2 0 2 105 | |
# Same table for the 2013 subcorpus (call form is valid on Python 2 and 3).
print(data['2013'])
# -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 Total | |
# risk 93 92 58 47 37 16 1 6591 0 11 33 49 48 77 96 7249 | |
# risks 32 24 12 17 24 3 0 2217 1 2 22 12 23 29 26 2444 | |
# risky 4 4 8 1 3 2 0 391 0 2 2 3 6 4 2 432 | |
# cancer 10 9 20 16 3 3 39 0 1 35 113 38 29 19 11 346 | |
# people 18 24 18 43 43 32 1 0 10 19 23 17 27 22 24 321 | |
# heart 8 15 14 8 3 18 14 0 0 119 32 14 10 9 5 269 | |
# high 9 7 6 6 8 8 136 0 0 17 13 12 13 7 22 264 | |
# increased 3 6 5 1 3 40 160 0 0 3 4 8 5 7 4 249 | |
# disease 7 9 13 7 4 2 21 0 0 9 102 41 17 7 9 248 | |
# health 20 7 13 9 10 7 110 0 1 11 17 9 11 6 7 238 | |
# risked 0 0 0 0 0 0 0 213 0 0 3 2 2 2 6 228 | |
# percent 12 17 9 14 5 44 38 0 0 0 11 12 20 15 26 223 | |
# reduce 3 2 2 10 10 125 32 0 0 0 0 1 5 2 0 192 | |
# factors 2 1 8 4 6 1 0 0 144 0 3 0 4 6 3 182 | |
# financial 8 10 9 15 6 4 51 0 5 10 21 6 16 14 7 182 | |
# higher 12 1 9 2 16 8 82 0 2 8 5 7 12 11 2 177 | |
# risking 4 2 1 0 0 0 0 165 0 0 0 0 1 3 0 176 | |
# potential 3 5 7 6 3 20 73 0 1 14 10 8 8 5 5 168 | |
# women 14 8 23 17 7 13 0 0 5 10 8 9 16 14 15 159 | |
# bank 15 11 13 9 15 18 5 0 1 7 12 16 10 10 13 155 | |
# greater 3 3 5 4 7 20 85 0 1 5 3 5 4 3 4 152 | |
# political 5 3 5 7 4 0 90 0 0 10 5 8 1 7 6 151 | |
# banks 10 17 13 14 11 8 1 0 0 12 10 17 17 11 9 150 | |
# management 4 6 1 0 2 4 0 0 100 0 12 5 5 3 6 148 | |
# breast 4 11 10 3 1 14 0 0 0 60 21 6 7 5 5 147 | |
# The 2013 table rendered as a LaTeX tabular (call form is valid on Python 2 and 3).
print(data['2013'].to_latex())
# \begin{tabular}{lrrrrrrrrrrrrrrrr} | |
# \toprule | |
# {} & -7 & -6 & -5 & -4 & -3 & -2 & -1 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & Total \\ | |
# \midrule | |
# risk & 93 & 92 & 58 & 47 & 37 & 16 & 1 & 6591 & 0 & 11 & 33 & 49 & 48 & 77 & 96 & 7249 \\ | |
# risks & 32 & 24 & 12 & 17 & 24 & 3 & 0 & 2217 & 1 & 2 & 22 & 12 & 23 & 29 & 26 & 2444 \\ | |
# risky & 4 & 4 & 8 & 1 & 3 & 2 & 0 & 391 & 0 & 2 & 2 & 3 & 6 & 4 & 2 & 432 \\ | |
# cancer & 10 & 9 & 20 & 16 & 3 & 3 & 39 & 0 & 1 & 35 & 113 & 38 & 29 & 19 & 11 & 346 \\ | |
# people & 18 & 24 & 18 & 43 & 43 & 32 & 1 & 0 & 10 & 19 & 23 & 17 & 27 & 22 & 24 & 321 \\ | |
# heart & 8 & 15 & 14 & 8 & 3 & 18 & 14 & 0 & 0 & 119 & 32 & 14 & 10 & 9 & 5 & 269 \\ | |
# high & 9 & 7 & 6 & 6 & 8 & 8 & 136 & 0 & 0 & 17 & 13 & 12 & 13 & 7 & 22 & 264 \\ | |
# increased & 3 & 6 & 5 & 1 & 3 & 40 & 160 & 0 & 0 & 3 & 4 & 8 & 5 & 7 & 4 & 249 \\ | |
# disease & 7 & 9 & 13 & 7 & 4 & 2 & 21 & 0 & 0 & 9 & 102 & 41 & 17 & 7 & 9 & 248 \\ | |
# health & 20 & 7 & 13 & 9 & 10 & 7 & 110 & 0 & 1 & 11 & 17 & 9 & 11 & 6 & 7 & 238 \\ | |
# risked & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 213 & 0 & 0 & 3 & 2 & 2 & 2 & 6 & 228 \\ | |
# percent & 12 & 17 & 9 & 14 & 5 & 44 & 38 & 0 & 0 & 0 & 11 & 12 & 20 & 15 & 26 & 223 \\ | |
# reduce & 3 & 2 & 2 & 10 & 10 & 125 & 32 & 0 & 0 & 0 & 0 & 1 & 5 & 2 & 0 & 192 \\ | |
# factors & 2 & 1 & 8 & 4 & 6 & 1 & 0 & 0 & 144 & 0 & 3 & 0 & 4 & 6 & 3 & 182 \\ | |
# financial & 8 & 10 & 9 & 15 & 6 & 4 & 51 & 0 & 5 & 10 & 21 & 6 & 16 & 14 & 7 & 182 \\ | |
# higher & 12 & 1 & 9 & 2 & 16 & 8 & 82 & 0 & 2 & 8 & 5 & 7 & 12 & 11 & 2 & 177 \\ | |
# risking & 4 & 2 & 1 & 0 & 0 & 0 & 0 & 165 & 0 & 0 & 0 & 0 & 1 & 3 & 0 & 176 \\ | |
# potential & 3 & 5 & 7 & 6 & 3 & 20 & 73 & 0 & 1 & 14 & 10 & 8 & 8 & 5 & 5 & 168 \\ | |
# women & 14 & 8 & 23 & 17 & 7 & 13 & 0 & 0 & 5 & 10 & 8 & 9 & 16 & 14 & 15 & 159 \\ | |
# bank & 15 & 11 & 13 & 9 & 15 & 18 & 5 & 0 & 1 & 7 & 12 & 16 & 10 & 10 & 13 & 155 \\ | |
# greater & 3 & 3 & 5 & 4 & 7 & 20 & 85 & 0 & 1 & 5 & 3 & 5 & 4 & 3 & 4 & 152 \\ | |
# political & 5 & 3 & 5 & 7 & 4 & 0 & 90 & 0 & 0 & 10 & 5 & 8 & 1 & 7 & 6 & 151 \\ | |
# banks & 10 & 17 & 13 & 14 & 11 & 8 & 1 & 0 & 0 & 12 & 10 & 17 & 17 & 11 & 9 & 150 \\ | |
# management & 4 & 6 & 1 & 0 & 2 & 4 & 0 & 0 & 100 & 0 & 12 & 5 & 5 & 3 & 6 & 148 \\ | |
# breast & 4 & 11 & 10 & 3 & 1 & 14 & 0 & 0 & 0 & 60 & 21 & 6 & 7 & 5 & 5 & 147 \\ | |
# \bottomrule | |
# \end{tabular} | |
# The 2013 table rendered as CSV text (call form is valid on Python 2 and 3).
print(data['2013'].to_csv())
# ,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,Total | |
# risk,93,92,58,47,37,16,1,6591,0,11,33,49,48,77,96,7249 | |
# risks,32,24,12,17,24,3,0,2217,1,2,22,12,23,29,26,2444 | |
# risky,4,4,8,1,3,2,0,391,0,2,2,3,6,4,2,432 | |
# cancer,10,9,20,16,3,3,39,0,1,35,113,38,29,19,11,346 | |
# people,18,24,18,43,43,32,1,0,10,19,23,17,27,22,24,321 | |
# heart,8,15,14,8,3,18,14,0,0,119,32,14,10,9,5,269 | |
# high,9,7,6,6,8,8,136,0,0,17,13,12,13,7,22,264 | |
# increased,3,6,5,1,3,40,160,0,0,3,4,8,5,7,4,249 | |
# disease,7,9,13,7,4,2,21,0,0,9,102,41,17,7,9,248 | |
# health,20,7,13,9,10,7,110,0,1,11,17,9,11,6,7,238 | |
# risked,0,0,0,0,0,0,0,213,0,0,3,2,2,2,6,228 | |
# percent,12,17,9,14,5,44,38,0,0,0,11,12,20,15,26,223 | |
# reduce,3,2,2,10,10,125,32,0,0,0,0,1,5,2,0,192 | |
# factors,2,1,8,4,6,1,0,0,144,0,3,0,4,6,3,182 | |
# financial,8,10,9,15,6,4,51,0,5,10,21,6,16,14,7,182 | |
# higher,12,1,9,2,16,8,82,0,2,8,5,7,12,11,2,177 | |
# risking,4,2,1,0,0,0,0,165,0,0,0,0,1,3,0,176 | |
# potential,3,5,7,6,3,20,73,0,1,14,10,8,8,5,5,168 | |
# women,14,8,23,17,7,13,0,0,5,10,8,9,16,14,15,159 | |
# bank,15,11,13,9,15,18,5,0,1,7,12,16,10,10,13,155 | |
# greater,3,3,5,4,7,20,85,0,1,5,3,5,4,3,4,152 | |
# political,5,3,5,7,4,0,90,0,0,10,5,8,1,7,6,151 | |
# banks,10,17,13,14,11,8,1,0,0,12,10,17,17,11,9,150 | |
# management,4,6,1,0,2,4,0,0,100,0,12,5,5,3,6,148 | |
# breast,4,11,10,3,1,14,0,0,0,60,21,6,7,5,5,147 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment