Skip to content

Instantly share code, notes, and snippets.

@interrogator
Last active August 29, 2015 14:21
Show Gist options
  • Select an option

  • Save interrogator/9feebf3bb571db23265e to your computer and use it in GitHub Desktop.

Select an option

Save interrogator/9feebf3bb571db23265e to your computer and use it in GitHub Desktop.
eugener-code
def _risk_sentences(corpus_path, risk_word):
    """Return the lowercased Tregex output lines that mention a risk word.

    Runs ``tregex.sh -t -w`` over *corpus_path* and keeps only the output
    lines that *risk_word* (a compiled regex) matches, which drops blank
    lines and any tool chatter.

    Raises:
        OSError: if ``tregex.sh`` cannot be executed (e.g. not on PATH).
    """
    import subprocess

    # An argument list (no shell) keeps paths containing spaces intact.
    # stderr is folded into stdout so the regex filter below sees the same
    # stream an interactive ``!tregex.sh ...`` capture would presumably
    # have returned -- NOTE(review): confirm against the IPython version
    # this was first written for.
    proc = subprocess.Popen(
        ['tregex.sh', '-t', '-w', r'/(?i)\brisk.?/', corpus_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True
    )
    output, _ = proc.communicate()
    # The exit status is deliberately not checked: a corpus that yields no
    # usable lines simply produces empty tallies.
    return [line.lower() for line in output.split('\n')
            if risk_word.search(line)]


def _tokenise(sentences):
    """Join *sentences* with newlines and split them into word tokens."""
    # Imported here rather than at the top of eugener() so that NLTK is
    # only required once there is actually a corpus to tokenise.
    import nltk

    return nltk.word_tokenize('\n'.join(sentences))


def _top_collocates(tokens, hits, offset, top):
    """Tally the tokens found *offset* positions away from each hit.

    Positions that fall outside the token list are skipped, and only
    alphanumeric tokens are counted (this removes punctuation).  Returns
    the *top* most common ``(token, count)`` pairs.
    """
    from collections import Counter

    total = len(tokens)
    tally = Counter()
    for hit in hits:
        position = hit + offset
        # Explicit bounds check on BOTH sides: a negative index would not
        # raise, it would silently read from the end of the list.
        if 0 <= position < total:
            neighbour = tokens[position]
            if neighbour.isalnum():
                tally[neighbour] += 1
    return tally.most_common(top)


def eugener(path='data/nyt/earlylate', depth=5, top=10):
    """Count the words that occur near risk words in each subcorpus.

    Every immediate subdirectory of *path* is treated as one subcorpus.
    For each one, sentences containing a risk word are pulled out with
    ``tregex.sh``, lowercased, joined and tokenised with NLTK.  Then, for
    every offset from ``-depth`` to ``+depth``, the tokens sitting that
    many positions from each risk token are tallied.

    Args:
        path: directory whose subdirectories are the subcorpora.
        depth: how many token positions to look left and right.
        top: how many of the most frequent tokens to keep per offset.

    Returns:
        A list with one entry per subcorpus (sorted by name).  Each entry
        is ``[name, [-depth, pairs], ..., [0, pairs], ..., [depth, pairs]]``
        where ``pairs`` is a list of ``(token, count)`` tuples.  Offset 0
        is included; its tallies are the risk tokens themselves.

    Raises:
        OSError: if *path* cannot be listed or ``tregex.sh`` cannot be run.

    Note:
        All matching sentences are tokenised as one flat token list, so a
        window near the edge of one sentence reaches into its neighbour.
    """
    import os
    import re

    # define risk word (compiled once; used for both lines and tokens)
    risk_word = re.compile(r'(?i)\brisk')

    # get list of subcorpora; sorted because os.listdir order is arbitrary
    subcorpora = sorted(
        name for name in os.listdir(path)
        if os.path.isdir(os.path.join(path, name))
    )

    all_data = []
    for corpus in subcorpora:
        print('Doing %s ... ' % corpus)
        sentences = _risk_sentences(os.path.join(path, corpus), risk_word)
        tokens = _tokenise(sentences)

        # Locate the risk tokens once instead of re-scanning per offset.
        hits = [index for index, token in enumerate(tokens)
                if risk_word.search(token)]

        # a place for info about each corpus
        dta = [corpus]
        for offset in range(-depth, depth + 1):
            print('Depth: %d...' % offset)
            dta.append([offset, _top_collocates(tokens, hits, offset, top)])
        all_data.append(dta)
    return all_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment