Last active
August 29, 2015 14:07
-
-
Save sangheestyle/dc096c43640c2fae4842 to your computer and use it in GitHub Desktop.
nlp hw5
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""NLP homework 5, question 1.2: tag ambiguity of words in the Brown corpus.

Written so that it runs unchanged on both Python 2 and Python 3: no print
statements with ``>>`` redirection, no ``iteritems`` and no indexing into
the result of ``zip``.
"""

from collections import defaultdict


def _tags_by_word(tagged_words):
    """Return a mapping of each word to the set of distinct tags it carries."""
    word_tag = defaultdict(set)
    for word, tag in tagged_words:
        word_tag[word].add(tag)
    return word_tag


def _count_words_by_num_tags(word_tag):
    """Return a mapping ``number of distinct tags -> number of words``."""
    counts = defaultdict(int)
    for tags in word_tag.values():
        counts[len(tags)] += 1
    return counts


def _most_ambiguous_word(word_tag):
    """Return ``(word, tags)`` for the word with the most distinct tags.

    Ties go to the first such word in iteration order. An empty mapping
    yields ``(None, set())`` instead of raising.
    """
    if not word_tag:
        return None, set()
    return max(word_tag.items(), key=lambda item: len(item[1]))


def _first_sentence_per_tag(tagged_sents, word, tags):
    """Return ``{tag: sentence}`` with the first sentence using *word* as *tag*.

    The sentences are walked once, stopping as soon as every tag in *tags*
    has an example, rather than being re-scanned from the start per tag.
    """
    examples = {}
    for sent in tagged_sents:
        for token, tag in sent:
            if token == word and tag in tags and tag not in examples:
                examples[tag] = sent
        if len(examples) == len(tags):
            break
    return examples


def write_report(out, tagged_words, tagged_sents):
    """Write the answers to questions 1.2.1 and 1.2.2 to *out*.

    Args:
        out: Writable text file-like object; only ``write`` is called.
        tagged_words: Iterable of ``(word, tag)`` pairs.
        tagged_sents: Iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Section 1.2.1 is a table of how many distinct words have each number
    of distinct tags. Section 1.2.2 names the word with the most distinct
    tags, lists those tags, and shows one sentence for each tag. Table rows
    and tags are written in sorted order so the report is reproducible.
    """
    word_tag = _tags_by_word(tagged_words)
    num_words_by_num_tags = _count_words_by_num_tags(word_tag)

    out.write(">>> Ans 1.2.1\n")
    h1 = "# distinct tags"
    h2 = "# distinct words"
    out.write("%-*s %-*s\n" % (len(" "), h1, len(" "), h2))
    for key in sorted(num_words_by_num_tags):
        # Column widths come from the headings so the rows line up with them.
        out.write("%-*d %-*d\n"
                  % (len(h1), key, len(h2), num_words_by_num_tags[key]))

    out.write("\n>>> Ans 1.2.2\n")
    max_word, max_tags = _most_ambiguous_word(word_tag)
    if max_word is None:
        # No words at all: there is nothing to report for this section.
        return

    ordered_tags = sorted(max_tags)
    out.write("%s has %d distinct tags.\n" % (max_word, len(ordered_tags)))
    # Two spaces after each tag reproduce the layout of the original
    # ``print tag, "",`` loop.
    out.write("tags: " + "".join("%s  " % tag for tag in ordered_tags))
    out.write("\nSentences from the corpus containing the word, "
              "one for each possible tag.\n\n")

    examples = _first_sentence_per_tag(tagged_sents, max_word, max_tags)
    for index, tag in enumerate(ordered_tags, 1):
        if tag not in examples:
            # No sentence pairs the word with this tag; emit nothing for it.
            continue
        out.write("# %d : %s as %s\n" % (index, max_word, tag))
        out.write(" ".join(word for word, _ in examples[tag]) + "\n")
        out.write("\n")


def main():
    """Build the report from the Brown corpus and save it to a text file."""
    # Imported here so the functions above can be imported without NLTK.
    from nltk.corpus import brown

    print(">>> Procssing")
    file_name = "ans_hw5_1_2.txt"
    # Text mode plus a context manager: the file is closed even on error.
    with open(file_name, "w") as f1:
        write_report(f1, brown.tagged_words(), brown.tagged_sents())
    print("<<< Done, check " + file_name)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment