Skip to content

Instantly share code, notes, and snippets.

@language-engineering
Last active October 12, 2015 00:48
Show Gist options
  • Select an option

  • Save language-engineering/3945886 to your computer and use it in GitHub Desktop.

Select an option

Save language-engineering/3945886 to your computer and use it in GitHub Desktop.
from math import log
from sussex_nltk.corpus_readers import WSJCorpusReader
def get_entropy_ambiguity(word):
    """Return the PoS-tag ambiguity of *word*, measured as the entropy of
    its part-of-speech tag distribution over the WSJ corpus."""
    tag_counts = {}  # tag -> number of times *word* carries that tag
    for token, tag in WSJCorpusReader().tagged_words():
        if token == word:
            # Bump the count for this tag, starting from 0 the first
            # time the tag is seen (avoids the try/except KeyError dance).
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
    # The entropy of the tag counts is the ambiguity score.
    return entropy(tag_counts.values())
def entropy(counts):
    """Return the Shannon entropy (base 2) of a distribution given as counts.

    counts -- iterable of non-negative occurrence counts (e.g. tag counts).
    Returns 0 when the counts are empty or sum to zero.
    """
    total = sum(counts)  # total number of occurrences
    if not total:
        return 0  # no observations at all -> zero entropy
    h = 0
    for c in counts:
        p = c / float(total)  # float() keeps the division correct on Python 2
        # Skip zero-probability events explicitly instead of catching the
        # ValueError from log(0): EAFP is wrong here since p == 0 is an
        # expected, ordinary case, not an exceptional one.
        if p:
            h += p * log(p, 2)
    # Negate only when nonzero; otherwise the float -0.0 could be returned,
    # which is confusing to callers.
    return -h if h else h
# Usage example: compute and display the PoS ambiguity of "blue".
# Parenthesized form works under both Python 2 (print statement with a
# single parenthesized expression) and Python 3 (print function).
print('Ambiguity of "blue": %s' % get_entropy_ambiguity("blue"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment