Last active
October 12, 2015 00:48
-
-
Save language-engineering/3945886 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from math import log | |
| from sussex_nltk.corpus_readers import WSJCorpusReader | |
def get_entropy_ambiguity(word):
    """Return the part-of-speech ambiguity of *word*, measured as the
    Shannon entropy (in bits) of its PoS-tag distribution in the
    Wall Street Journal corpus.

    A word that always appears with one tag scores 0; the more evenly
    it is spread over several tags, the higher the score.
    """
    pos_counts = {}  # tag -> number of times *word* carries that tag
    # NOTE(review): tagged_words() streams every (token, tag) pair in WSJ,
    # so this is a full corpus scan per call — fine for one-off use.
    for token, tag in WSJCorpusReader().tagged_words():
        if token == word:
            # dict.get with a default replaces the try/except KeyError
            # counting idiom — no exception raised per first-seen tag.
            pos_counts[tag] = pos_counts.get(tag, 0) + 1
    return entropy(pos_counts.values())
def entropy(counts):
    """Return the Shannon entropy (in bits) of a distribution given as
    raw occurrence counts.

    counts -- any iterable of non-negative numbers (e.g. dict.values()
              or a generator); zero counts are ignored, and an empty or
              all-zero input yields 0.
    """
    # Materialize once: the original iterated *counts* twice (sum, then
    # the loop), which silently yields 0 for a one-shot iterator.
    counts = list(counts)
    total = sum(counts)
    if not total:
        return 0  # no observations -> zero entropy
    acc = 0.0  # renamed: the original shadowed the function name
    for c in counts:
        p = c / float(total)  # probability of this outcome
        if p > 0:  # skip zero counts; log(0) is undefined
            acc += p * log(p, 2)
    # Negate only when nonzero: -0.0 (possible from float arithmetic
    # when entropy is exactly 0) prints confusingly.
    return -acc if acc else acc
# Usage example: report the PoS ambiguity of "blue" in WSJ.
# print() call form is identical in behavior under Python 2 (the
# parentheses wrap the single expression) and also valid Python 3,
# unlike the original 2-only print statement.
print('Ambiguity of "blue": %s' % get_entropy_ambiguity("blue"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment