Skip to content

Instantly share code, notes, and snippets.

@jnothman
Created March 22, 2017 23:48
Show Gist options
  • Save jnothman/6920ab04ed1880d8d3de500b0e10b68a to your computer and use it in GitHub Desktop.
Save jnothman/6920ab04ed1880d8d3de500b0e10b68a to your computer and use it in GitHub Desktop.
from __future__ import print_function
from collections import Counter
import nltk
def count_ngrams(tokens, min_unigram_freq=2, min_ngram_freq=5, max_n=5):
    """Count the n-grams of ``tokens`` for every order from 1 to ``max_n``.

    Tokens seen fewer than ``min_unigram_freq`` times are replaced by the
    placeholder ``'<UNK>'`` before higher-order n-grams are counted, and
    n-grams of order 2 and above seen fewer than ``min_ngram_freq`` times
    are dropped.  Summary statistics are printed along the way.

    Parameters
    ----------
    tokens : iterable of hashable
        The token sequence to analyse.  It is materialised into a list, so
        any iterable is accepted.
    min_unigram_freq : int, default 2
        Minimum frequency a token needs in order to be kept as itself.
    min_ngram_freq : int, default 5
        Minimum frequency an n-gram (n >= 2) needs in order to be kept.
    max_n : int, default 5
        Highest n-gram order to count.

    Returns
    -------
    list
        ``counters`` such that ``counters[0]`` is ``None``, ``counters[1]``
        is a ``Counter`` of tokens (rare ones merged under ``'<UNK>'``) and
        ``counters[n]`` for ``2 <= n <= max_n`` is a ``Counter`` keyed by
        n-tuples of tokens.
    """
    tokens = list(tokens)
    n_tokens = len(tokens)
    print('Number of tokens:', n_tokens)

    unigram_freqs = Counter(tokens)
    print('Number of distinct tokens ("types"):', len(unigram_freqs))

    # Collect the rare tokens first and fold them into '<UNK>' in one step,
    # so that a rare literal '<UNK>' token cannot wipe out counts that were
    # already accumulated under the placeholder.
    rare_tokens = [token for token, freq in unigram_freqs.items()
                   if freq < min_unigram_freq]
    unk_freq = sum(unigram_freqs.pop(token) for token in rare_tokens)
    if rare_tokens:
        unigram_freqs['<UNK>'] += unk_freq

    # Only discount the placeholder when it is actually present.
    n_kept_types = len(unigram_freqs) - ('<UNK>' in unigram_freqs)
    print('Number of distinct tokens after thresholding:',
          n_kept_types)

    tokens = [tok if tok in unigram_freqs else '<UNK>'
              for tok in tokens]

    counters = [None, unigram_freqs]
    for n in range(2, max_n + 1):
        # Slide a window of exactly n tokens over the whole sequence.
        # Counting prefixes of the max_n-grams instead would miss the last
        # (max_n - n) n-grams, and everything when n_tokens < max_n.
        counts = Counter(tuple(tokens[i:i + n])
                         for i in range(n_tokens - n + 1))
        before_count = len(counts)
        counts = Counter({ngram: freq for ngram, freq in counts.items()
                          if freq >= min_ngram_freq})
        print('Number of distinct', n, 'grams:', before_count, 'before and',
              len(counts), 'after thresholding')
        counters.append(counts)
    return counters
# Script entry point: count n-grams over the word tokens of NLTK's Brown
# corpus reader, using the default thresholds and maximum order.
# NOTE(review): presumably requires the Brown corpus data to be installed
# locally (e.g. via nltk.download('brown')) - confirm before running.
count_ngrams(nltk.corpus.brown.words())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment