Last active
October 12, 2015 00:48
-
-
Save language-engineering/3945886 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from math import log | |
| from sussex_nltk.corpus_readers import WSJCorpusReader | |
def get_entropy_ambiguity(word):
    """Return the part-of-speech ambiguity of *word*, measured as the
    Shannon entropy (in bits) of its PoS-tag distribution in the
    Wall Street Journal corpus.

    A word that always appears with one tag scores 0; the more evenly
    it is spread over several tags, the higher the score.
    """
    pos_counts = {}  # tag -> number of times *word* carries that tag
    # NOTE(review): tagged_words() streams every (token, tag) pair in WSJ,
    # so this is a full corpus scan per call — fine for one-off use.
    for token, tag in WSJCorpusReader().tagged_words():
        if token == word:
            # dict.get with a default replaces the try/except KeyError
            # counting idiom — no exception raised per first-seen tag.
            pos_counts[tag] = pos_counts.get(tag, 0) + 1
    return entropy(pos_counts.values())
def entropy(counts):
    """Return the Shannon entropy (in bits) of a distribution given as
    raw occurrence counts.

    counts -- any iterable of non-negative numbers (e.g. dict.values()
              or a generator); zero counts are ignored, and an empty or
              all-zero input yields 0.
    """
    # Materialize once: the original iterated *counts* twice (sum, then
    # the loop), which silently yields 0 for a one-shot iterator.
    counts = list(counts)
    total = sum(counts)
    if not total:
        return 0  # no observations -> zero entropy
    acc = 0.0  # renamed: the original shadowed the function name
    for c in counts:
        p = c / float(total)  # probability of this outcome
        if p > 0:  # skip zero counts; log(0) is undefined
            acc += p * log(p, 2)
    # Negate only when nonzero: -0.0 (possible from float arithmetic
    # when entropy is exactly 0) prints confusingly.
    return -acc if acc else acc
# Usage example: report the PoS ambiguity of "blue" in WSJ.
# print() call form is identical in behavior under Python 2 (the
# parentheses wrap the single expression) and also valid Python 3,
# unlike the original 2-only print statement.
print('Ambiguity of "blue": %s' % get_entropy_ambiguity("blue"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment