Created
October 16, 2015 18:50
-
-
Save sandsfish/de58318b0bae5d06c056 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# all_summary is the data here: in this case, many text records collapsed into one corpus string.
# Prep data for NLTK Analysis | |
import nltk.collocations | |
# Tokenize the raw corpus string and wrap the tokens in an nltk.Text.
tokens = nltk.word_tokenize(all_summary)
text = nltk.Text(tokens)
# Normalize: keep alphabetic tokens only, lower-case them, then drop English stop-words.
from nltk.corpus import stopwords
# NOTE: this rebinds the imported `stopwords` name to the plain list of English
# stop-words; the name is kept in case later code refers to it.
stopwords = stopwords.words('english')
# Build the lookup set once: membership tests against the list would rescan it
# for every token in the corpus.
_stopword_set = set(stopwords)
text1 = nltk.Text([w.lower() for w in text if w.isalpha()])
text2 = [w for w in text1 if w not in _stopword_set]
# 200 Best 'Bigram Collocations' by Score - Write to File | |
from nltk.collocations import BigramCollocationFinder | |
from nltk.metrics import BigramAssocMeasures | |
from nltk import collocations | |
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200, freq=5):
    """Find the best-scoring bigram collocations in *words* and save them to a file.

    Args:
        words: Sequence of tokens to scan for bigrams.
        score_fn: Association measure handed to ``nbest`` to rank the bigrams.
        n: Maximum number of bigrams to keep.
        freq: Minimum frequency handed to ``apply_freq_filter``.

    Returns:
        The bigrams returned by ``nbest``.

    Side effects:
        Writes one bigram per line ("first second") to a text file in the
        current working directory; the file name embeds ``n`` and ``freq``.
    """
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_freq_filter(freq)
    bigrams = bigram_finder.nbest(score_fn, n)
    # NOTE: the "_pmi" suffix is fixed even when score_fn is not PMI; it is kept
    # unchanged so existing output paths stay the same.
    out_path = 'best_{0}_bigram_collocations_more_than_{1}_occurrences_pmi.txt'.format(n, freq)
    # Text mode with an explicit encoding: writing str to a 'wb' handle raises
    # TypeError on Python 3. newline='\n' keeps the line endings exactly as the
    # binary-mode version wrote them.
    with open(out_path, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines("{0} {1}\n".format(b[0], b[1]) for b in bigrams)
    return bigrams
# Score the filtered tokens with PMI: up to 200 bigrams, frequency filter of 20.
best_bigrams = best_bigram_word_feats(
    text2,
    score_fn=BigramAssocMeasures.pmi,
    n=200,
    freq=20,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment