Skip to content

Instantly share code, notes, and snippets.

@interrogator
Created May 19, 2015 13:20
Show Gist options
  • Save interrogator/cf42d9e3faf44a2be55b to your computer and use it in GitHub Desktop.
Save interrogator/cf42d9e3faf44a2be55b to your computer and use it in GitHub Desktop.
eugener-code-and-output
#eugene script
def eugener(path = 'data/nyt/earlylate',
regex = r'(?i)\brisk',
depth = 5,
top = 10,
remove_stopwords = False):
"""
get most frequent words in corpus path to left and right
"""
import os
import nltk
import re
from StringIO import StringIO
from collections import Counter
import pandas as pd
from dictionaries.stopwords import stopwords as stopwords
# get list of subcorpora
dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
# define risk word
# place for our output
dfs = {}
for corpus in dirs:
print 'Doing %s ... ' % corpus
# search the corpus for whole sents containing risk word
pathed = os.path.join(path, corpus)
results = !./tregex.sh -t -w "/$regex/" $pathed
# remove blank lines, etc
results = [result for result in results if re.search(regex, result)]
# lowercase
results = [result.lower() for result in results]
# make into single string
results = '\n'.join(results)
# tokenise the string
results = nltk.word_tokenize(results)
# a place for info about each corpus
# go left and right depth times
all_words = []
dicts = []
for i in range(-depth, (depth + 1)):
newdict = Counter()
# exclude the 0 iteration, because it will just be risk words
print 'Depth: %d...' % i
matching = []
# go through each token
for index, token in enumerate(results):
# if token matches risk expression
if re.search(regex, token):
# get the word at depth index
# try statement for cases where the target word index isn't there
try:
if i < 0:
num = index - abs(i)
matching.append(results[num])
else:
matching.append(results[index + i])
except:
pass
# tally results
counted = Counter(matching)
# remove punctuation etc
for key in counted:
if key.isalnum():
#if key not in stopwords:
if remove_stopwords:
if key not in stopwords:
newdict[key] = counted[key]
else:
newdict[key] = counted[key]
for w in counted.keys():
all_words.append(w)
#top_tokens = newdict.most_common(top)
dicts.append(newdict)
# make pandas series
print 'Making DataFrame ... '
sers = []
for word in list(set(all_words)):
series = [dct[word] for dct in dicts]
series.append(sum([dct[word] for dct in dicts]))
index_names = range(-depth, (depth + 1))
index_names.append('Total')
ser = pd.Series(series, index = index_names)
ser.name = word
sers.append(ser)
# concatenate series
df = pd.concat(sers, axis=1)
# remove zero depth
#df = df.drop(df.index[depth])
# sort by total
tot = df.ix['Total']
df = df[tot.argsort()[::-1]]
# just top entries
df = pd.DataFrame(df[list(df.columns)[:top]])
#transpose
dfs[corpus] = df.T
return dfs
# Collocates up to 7 tokens either side of the target word, 25 words per
# subcorpus, stopwords filtered out.
data = eugener(depth=7, top=25, remove_stopwords=True)
import pandas as pd
# Raise pandas' display limits so the wide tables print in full.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Table for the '1995' subcorpus.
print(data['1995'])
# -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 Total
# risk 73 59 39 30 21 6 0 4749 0 11 23 26 33 61 72 5203
# risks 32 22 11 13 6 3 0 1611 0 0 5 12 15 25 30 1785
# risky 12 8 8 5 4 0 0 684 0 0 1 8 5 9 7 751
# high 8 8 7 6 4 7 127 0 2 4 8 6 7 5 8 207
# people 16 17 13 22 24 16 1 0 5 10 24 7 20 13 14 202
# cancer 4 4 3 1 3 0 47 0 0 26 66 16 9 13 5 197
# health 6 6 5 10 9 1 91 0 4 14 14 6 11 7 10 194
# heart 3 3 2 1 2 17 6 0 1 76 38 14 5 5 9 182
# disease 10 2 4 1 4 0 16 0 0 8 54 42 18 8 6 173
# risking 1 1 1 0 0 0 0 157 0 0 0 1 2 1 2 166
# women 6 12 17 22 14 7 0 0 8 5 11 21 20 13 7 163
# risked 0 0 0 0 0 0 0 141 0 0 0 3 0 3 3 150
# political 6 6 1 8 8 0 65 0 7 10 6 6 5 5 11 144
# reduce 3 2 1 3 7 93 22 0 0 0 3 0 1 2 2 139
# percent 17 7 13 11 12 15 6 0 0 0 9 6 10 17 12 135
# business 8 3 9 10 10 0 5 0 52 2 11 6 6 8 4 134
# factors 5 1 3 6 2 0 0 0 110 1 0 3 0 2 1 134
# losing 2 1 3 2 0 0 0 0 84 26 1 1 2 3 3 128
# increased 2 2 2 3 1 18 84 0 0 6 1 3 2 1 2 127
# investors 11 14 14 13 19 1 2 0 4 9 12 8 6 5 6 124
# greater 4 6 5 4 6 6 63 0 2 3 2 7 7 5 3 123
# death 1 3 2 2 1 0 6 0 7 29 29 16 7 8 2 113
# financial 2 7 6 6 2 1 33 0 5 10 6 9 7 6 7 107
# american 18 8 9 5 11 2 1 0 7 11 8 8 7 6 4 105
# lives 1 6 6 9 2 7 0 0 3 54 8 5 2 0 2 105
# Table for the '2013' subcorpus.
print(data['2013'])
# -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 Total
# risk 93 92 58 47 37 16 1 6591 0 11 33 49 48 77 96 7249
# risks 32 24 12 17 24 3 0 2217 1 2 22 12 23 29 26 2444
# risky 4 4 8 1 3 2 0 391 0 2 2 3 6 4 2 432
# cancer 10 9 20 16 3 3 39 0 1 35 113 38 29 19 11 346
# people 18 24 18 43 43 32 1 0 10 19 23 17 27 22 24 321
# heart 8 15 14 8 3 18 14 0 0 119 32 14 10 9 5 269
# high 9 7 6 6 8 8 136 0 0 17 13 12 13 7 22 264
# increased 3 6 5 1 3 40 160 0 0 3 4 8 5 7 4 249
# disease 7 9 13 7 4 2 21 0 0 9 102 41 17 7 9 248
# health 20 7 13 9 10 7 110 0 1 11 17 9 11 6 7 238
# risked 0 0 0 0 0 0 0 213 0 0 3 2 2 2 6 228
# percent 12 17 9 14 5 44 38 0 0 0 11 12 20 15 26 223
# reduce 3 2 2 10 10 125 32 0 0 0 0 1 5 2 0 192
# factors 2 1 8 4 6 1 0 0 144 0 3 0 4 6 3 182
# financial 8 10 9 15 6 4 51 0 5 10 21 6 16 14 7 182
# higher 12 1 9 2 16 8 82 0 2 8 5 7 12 11 2 177
# risking 4 2 1 0 0 0 0 165 0 0 0 0 1 3 0 176
# potential 3 5 7 6 3 20 73 0 1 14 10 8 8 5 5 168
# women 14 8 23 17 7 13 0 0 5 10 8 9 16 14 15 159
# bank 15 11 13 9 15 18 5 0 1 7 12 16 10 10 13 155
# greater 3 3 5 4 7 20 85 0 1 5 3 5 4 3 4 152
# political 5 3 5 7 4 0 90 0 0 10 5 8 1 7 6 151
# banks 10 17 13 14 11 8 1 0 0 12 10 17 17 11 9 150
# management 4 6 1 0 2 4 0 0 100 0 12 5 5 3 6 148
# breast 4 11 10 3 1 14 0 0 0 60 21 6 7 5 5 147
# The same '2013' table rendered as a LaTeX tabular.
print(data['2013'].to_latex())
# \begin{tabular}{lrrrrrrrrrrrrrrrr}
# \toprule
# {} & -7 & -6 & -5 & -4 & -3 & -2 & -1 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & Total \\
# \midrule
# risk & 93 & 92 & 58 & 47 & 37 & 16 & 1 & 6591 & 0 & 11 & 33 & 49 & 48 & 77 & 96 & 7249 \\
# risks & 32 & 24 & 12 & 17 & 24 & 3 & 0 & 2217 & 1 & 2 & 22 & 12 & 23 & 29 & 26 & 2444 \\
# risky & 4 & 4 & 8 & 1 & 3 & 2 & 0 & 391 & 0 & 2 & 2 & 3 & 6 & 4 & 2 & 432 \\
# cancer & 10 & 9 & 20 & 16 & 3 & 3 & 39 & 0 & 1 & 35 & 113 & 38 & 29 & 19 & 11 & 346 \\
# people & 18 & 24 & 18 & 43 & 43 & 32 & 1 & 0 & 10 & 19 & 23 & 17 & 27 & 22 & 24 & 321 \\
# heart & 8 & 15 & 14 & 8 & 3 & 18 & 14 & 0 & 0 & 119 & 32 & 14 & 10 & 9 & 5 & 269 \\
# high & 9 & 7 & 6 & 6 & 8 & 8 & 136 & 0 & 0 & 17 & 13 & 12 & 13 & 7 & 22 & 264 \\
# increased & 3 & 6 & 5 & 1 & 3 & 40 & 160 & 0 & 0 & 3 & 4 & 8 & 5 & 7 & 4 & 249 \\
# disease & 7 & 9 & 13 & 7 & 4 & 2 & 21 & 0 & 0 & 9 & 102 & 41 & 17 & 7 & 9 & 248 \\
# health & 20 & 7 & 13 & 9 & 10 & 7 & 110 & 0 & 1 & 11 & 17 & 9 & 11 & 6 & 7 & 238 \\
# risked & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 213 & 0 & 0 & 3 & 2 & 2 & 2 & 6 & 228 \\
# percent & 12 & 17 & 9 & 14 & 5 & 44 & 38 & 0 & 0 & 0 & 11 & 12 & 20 & 15 & 26 & 223 \\
# reduce & 3 & 2 & 2 & 10 & 10 & 125 & 32 & 0 & 0 & 0 & 0 & 1 & 5 & 2 & 0 & 192 \\
# factors & 2 & 1 & 8 & 4 & 6 & 1 & 0 & 0 & 144 & 0 & 3 & 0 & 4 & 6 & 3 & 182 \\
# financial & 8 & 10 & 9 & 15 & 6 & 4 & 51 & 0 & 5 & 10 & 21 & 6 & 16 & 14 & 7 & 182 \\
# higher & 12 & 1 & 9 & 2 & 16 & 8 & 82 & 0 & 2 & 8 & 5 & 7 & 12 & 11 & 2 & 177 \\
# risking & 4 & 2 & 1 & 0 & 0 & 0 & 0 & 165 & 0 & 0 & 0 & 0 & 1 & 3 & 0 & 176 \\
# potential & 3 & 5 & 7 & 6 & 3 & 20 & 73 & 0 & 1 & 14 & 10 & 8 & 8 & 5 & 5 & 168 \\
# women & 14 & 8 & 23 & 17 & 7 & 13 & 0 & 0 & 5 & 10 & 8 & 9 & 16 & 14 & 15 & 159 \\
# bank & 15 & 11 & 13 & 9 & 15 & 18 & 5 & 0 & 1 & 7 & 12 & 16 & 10 & 10 & 13 & 155 \\
# greater & 3 & 3 & 5 & 4 & 7 & 20 & 85 & 0 & 1 & 5 & 3 & 5 & 4 & 3 & 4 & 152 \\
# political & 5 & 3 & 5 & 7 & 4 & 0 & 90 & 0 & 0 & 10 & 5 & 8 & 1 & 7 & 6 & 151 \\
# banks & 10 & 17 & 13 & 14 & 11 & 8 & 1 & 0 & 0 & 12 & 10 & 17 & 17 & 11 & 9 & 150 \\
# management & 4 & 6 & 1 & 0 & 2 & 4 & 0 & 0 & 100 & 0 & 12 & 5 & 5 & 3 & 6 & 148 \\
# breast & 4 & 11 & 10 & 3 & 1 & 14 & 0 & 0 & 0 & 60 & 21 & 6 & 7 & 5 & 5 & 147 \\
# \bottomrule
# \end{tabular}
# The same '2013' table rendered as CSV text.
print(data['2013'].to_csv())
# ,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,Total
# risk,93,92,58,47,37,16,1,6591,0,11,33,49,48,77,96,7249
# risks,32,24,12,17,24,3,0,2217,1,2,22,12,23,29,26,2444
# risky,4,4,8,1,3,2,0,391,0,2,2,3,6,4,2,432
# cancer,10,9,20,16,3,3,39,0,1,35,113,38,29,19,11,346
# people,18,24,18,43,43,32,1,0,10,19,23,17,27,22,24,321
# heart,8,15,14,8,3,18,14,0,0,119,32,14,10,9,5,269
# high,9,7,6,6,8,8,136,0,0,17,13,12,13,7,22,264
# increased,3,6,5,1,3,40,160,0,0,3,4,8,5,7,4,249
# disease,7,9,13,7,4,2,21,0,0,9,102,41,17,7,9,248
# health,20,7,13,9,10,7,110,0,1,11,17,9,11,6,7,238
# risked,0,0,0,0,0,0,0,213,0,0,3,2,2,2,6,228
# percent,12,17,9,14,5,44,38,0,0,0,11,12,20,15,26,223
# reduce,3,2,2,10,10,125,32,0,0,0,0,1,5,2,0,192
# factors,2,1,8,4,6,1,0,0,144,0,3,0,4,6,3,182
# financial,8,10,9,15,6,4,51,0,5,10,21,6,16,14,7,182
# higher,12,1,9,2,16,8,82,0,2,8,5,7,12,11,2,177
# risking,4,2,1,0,0,0,0,165,0,0,0,0,1,3,0,176
# potential,3,5,7,6,3,20,73,0,1,14,10,8,8,5,5,168
# women,14,8,23,17,7,13,0,0,5,10,8,9,16,14,15,159
# bank,15,11,13,9,15,18,5,0,1,7,12,16,10,10,13,155
# greater,3,3,5,4,7,20,85,0,1,5,3,5,4,3,4,152
# political,5,3,5,7,4,0,90,0,0,10,5,8,1,7,6,151
# banks,10,17,13,14,11,8,1,0,0,12,10,17,17,11,9,150
# management,4,6,1,0,2,4,0,0,100,0,12,5,5,3,6,148
# breast,4,11,10,3,1,14,0,0,0,60,21,6,7,5,5,147
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment