Created
March 22, 2016 01:23
-
-
Save ipha/9eedbf7f5e3af0f7bbaf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from nltk.corpus import brown | |
PUNCTUATION = "!\"#$%'()*+,-./:;<=>?@[\\]^_`{|}~" | |
MIN_OCCURANCES = 20 | |
NGRAM_LENGTH = 3 | |
# Hold a count of words | |
countdict = {} | |
# Results in form (count, word 1, word 2, word 3) | |
results = list() | |
words = ("",) * NGRAM_LENGTH | |
# Itterate through every word | |
for word in brown.words(): | |
words = words[1:] + (word.lower(),) | |
if words in countdict: | |
countdict[words] = countdict[words] + 1 | |
else: | |
countdict[words] = 1 | |
# Itterate through results filtering out punctuation and counts smaller than MIN_OCCURANCES | |
for ngram in countdict: | |
if countdict[ngram] > MIN_OCCURANCES: | |
if all(word not in PUNCTUATION for word in ngram): | |
results.append((countdict[ngram], ngram)) | |
# Print results sorted by number of occurances | |
for result in sorted(results, key=lambda result: result[0]): | |
print(result) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment