Last active
August 29, 2015 14:21
-
-
Save interrogator/9feebf3bb571db23265e to your computer and use it in GitHub Desktop.
eugener-code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def eugener(path='data/nyt/earlylate', depth=5, top=10):
    """Tally the words found near "risk" words in each subcorpus of *path*.

    Every immediate subdirectory of *path* is treated as one subcorpus.  Each
    one is searched with ``tregex.sh`` for output lines containing a risk
    word; those lines are lowercased and tokenised with NLTK, and for every
    offset from ``-depth`` to ``+depth`` the tokens sitting that far away
    from a risk token are counted.

    Args:
        path: Directory whose subdirectories are the subcorpora.
        depth: How many tokens to look to the left and to the right of each
            risk token.
        top: How many of the most frequent tokens to keep for each offset.

    Returns:
        A list with one entry per subcorpus (in ``os.listdir`` order), each
        shaped ``[name, [-depth, top_tokens], ..., [depth, top_tokens]]``,
        where ``top_tokens`` is a list of ``(token, count)`` pairs, most
        frequent first.  Offset 0 is included and simply holds the risk
        tokens themselves.  Only tokens for which ``str.isalnum()`` is true
        are counted, which drops punctuation.

    Raises:
        OSError: If *path* cannot be listed or ``tregex.sh`` cannot be run.
    """
    import os
    import re
    from collections import Counter

    # compiled once; reused for filtering lines and for spotting risk tokens
    risk = re.compile(r'(?i)\brisk')

    subcorpora = [d for d in os.listdir(path)
                  if os.path.isdir(os.path.join(path, d))]

    all_data = []
    for corpus in subcorpora:
        print('Doing %s ... ' % corpus)
        tokens = _risk_sentence_tokens(os.path.join(path, corpus), risk)
        # positions of the risk tokens do not depend on the offset, so find
        # them once per subcorpus instead of once per offset
        hits = [n for n, tok in enumerate(tokens) if risk.search(tok)]

        dta = [corpus]
        for offset in range(-depth, depth + 1):
            print('Depth: %d...' % offset)
            # Explicit bounds check: a negative position must be skipped,
            # not allowed to wrap around to the end of the token list.
            neighbours = [tokens[hit + offset] for hit in hits
                          if 0 <= hit + offset < len(tokens)]
            counts = Counter(tok for tok in neighbours if tok.isalnum())
            dta.append([offset, counts.most_common(top)])
        all_data.append(dta)
    return all_data


def _risk_sentence_tokens(corpus_path, risk):
    """Return lowercased tokens of the tregex output lines that mention risk.

    Args:
        corpus_path: Directory handed to ``tregex.sh`` as the corpus to search.
        risk: Compiled pattern; only output lines it matches are kept.
    """
    import subprocess

    # third-party; imported where it is used so that a *path* without any
    # subcorpora can be processed without NLTK
    import nltk

    # An argument list (no shell) keeps paths containing spaces intact.
    # stderr is folded into stdout, which is believed to match what the
    # IPython ``!`` capture used previously returned; the filter below
    # throws away any line that does not mention a risk word.
    proc = subprocess.Popen(
        ['tregex.sh', '-t', '-w', r'/(?i)\brisk.?/', corpus_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    output = proc.communicate()[0]
    # the filter runs on the raw line, then the kept line is lowercased
    lines = [line.lower() for line in output.splitlines() if risk.search(line)]
    return nltk.word_tokenize('\n'.join(lines))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment