Created
March 30, 2013 15:08
-
-
Save vaishaks/5277040 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
__author__ = "vaishaks <[email protected]>" | |
__date__ = "Mar 25, 2013" | |
import sys | |
from collections import defaultdict | |
# Emission counts, keyed by the concatenation ``word + tag``; the value is
# the number of times the word was tagged with that tag.
emission_counts = defaultdict(float)

# Unigram counts, keyed by tag; the value is the total number of times the
# tag occurred.
unigram_counts = defaultdict(float)
def create_counts(filename):
    """Load emission and unigram counts from a counts file into the globals.

    Every line is split on whitespace and dispatched on its second field:

    * ``<count> WORDTAG <tag> <word>`` stores ``count`` (as a float) in
      ``emission_counts[word + tag]``.  Entries for ``word + "O"`` and
      ``word + "I-GENE"`` are created with 0.0 if they do not exist yet, so
      every word seen in the file has an entry for both tags.
    * ``<count> 1-GRAM <tag>`` stores ``count`` (as a float) in
      ``unigram_counts[tag]``.

    Lines of any other type, and lines with fewer than two fields (for
    example blank lines), are ignored.

    If the file cannot be opened, an error message is written to stderr and
    the process exits with status 1.
    """
    try:
        count_file = open(filename, 'r')
    except IOError:
        sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % filename)
        sys.exit(1)
    # The context manager guarantees the handle is closed even if a line
    # fails to parse.
    with count_file:
        for line in count_file:
            fields = line.split()
            if len(fields) < 2:
                # Blank or truncated line: there is no record type to read.
                continue
            if fields[1] == 'WORDTAG':
                word = fields[3]
                # setdefault never overwrites a count that was already read.
                emission_counts.setdefault(word + "O", 0.0)
                emission_counts.setdefault(word + "I-GENE", 0.0)
                emission_counts[word + fields[2]] = float(fields[0])
            elif fields[1] == '1-GRAM':
                unigram_counts[fields[2]] = float(fields[0])
def map_infrequent_words(train_path='gene.train', out_path='gene_rare.train',
                         threshold=5):
    """Rewrite the training file with infrequent words replaced by _RARE_.

    A word is infrequent when its total count in ``emission_counts``, summed
    over every known tag, is below ``threshold``.  Each ``word tag`` line of
    such a word is written as ``_RARE_ tag``; all other lines are copied
    unchanged.  Lines with fewer than two fields (for example the blank
    lines in the file) are also copied unchanged.

    ``emission_counts`` and ``unigram_counts`` must already be populated.

    Arguments:
    train_path -- tagged training file to read, one ``word tag`` per line.
    out_path   -- file to write the rewritten training data to.
    threshold  -- minimum total count for a word to be kept as is.
    """
    known_tags = set(unigram_counts)
    with open(train_path, 'r') as training_file, \
            open(out_path, 'w') as new_training_file:
        for line in training_file:
            fields = line.split()
            if len(fields) < 2:
                new_training_file.write(line)
                continue
            word, tag = fields[0], fields[1]
            # Sum over all tags so that a frequent word is not marked rare
            # merely because one of its taggings is uncommon.  .get() is
            # used so the lookup does not add keys to a defaultdict.
            word_total = sum(emission_counts.get(word + t, 0.0)
                             for t in known_tags | {tag})
            if word_total < threshold:
                new_training_file.write("_RARE_ " + tag + "\n")
            else:
                new_training_file.write(line)
def emission(word, tag):
    """Return e(x|y), the emission parameter for word x given tag y.

    e(x|y) = Count(y->x) / Count(y)

    Count(y->x) is the emission count where x is tagged as y.
    Count(y) is the unigram count, i.e. the total number of times words
    are tagged as y.

    If ``word + tag`` has no entry in ``emission_counts``, the count stored
    for ``'_RARE_' + tag`` is used instead.  A tag whose unigram count is
    missing or zero yields 0.0 rather than a division by zero.
    """
    # .get() is used throughout so that lookups never add keys to the
    # defaultdicts.
    tag_total = unigram_counts.get(tag, 0.0)
    if not tag_total:
        return 0.0
    key = word + tag
    if key not in emission_counts:
        key = '_RARE_' + tag
    return emission_counts.get(key, 0.0) / tag_total
def tagger(word):
    """Return the tag with the highest emission value for ``word``.

    Every tag in ``unigram_counts`` is scored once with ``emission``.  When
    several tags share the highest score, the one that comes first in the
    dictionary's iteration order is returned.
    """
    scored = [(candidate, emission(word, candidate))
              for candidate in unigram_counts.keys()]
    best_score = max(score for _, score in scored)
    for candidate, score in scored:
        if score == best_score:
            return candidate
def file_tagger(input_path='gene.dev', output_path='gene_dev.p1.out'):
    """Tag every word of an untagged file and write the result to a file.

    The input holds one word per line.  Each word is written to the output
    as ``word tag``, where the tag comes from ``tagger``.  Empty input lines
    are written out as empty lines.

    Arguments:
    input_path  -- untagged file to read.
    output_path -- file the ``word tag`` lines are written to.
    """
    with open(input_path, 'r') as inputfile, \
            open(output_path, 'w') as outputfile:
        for line in inputfile:
            # Strip only the line terminator: slicing off the last character
            # would truncate the final word of a file that does not end
            # with a newline.
            word = line.rstrip('\r\n')
            if not word:
                outputfile.write('\n')
            else:
                outputfile.write(word + " " + tagger(word) + "\n")
# Pass 1: load the raw counts and write the training data with infrequent
# words replaced by _RARE_.
create_counts("gene.count")
map_infrequent_words()

# Pass 2: reload the counts from the _RARE_-mapped data.  The dictionaries
# are cleared first; otherwise the pass-1 entries of the replaced words
# would remain and emission() would never fall back to the _RARE_ counts
# for them.
# NOTE(review): gene_rare.count is not written by this script; presumably it
# is generated from gene_rare.train by an external counting step -- confirm.
emission_counts.clear()
unigram_counts.clear()
create_counts("gene_rare.count")

file_tagger()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment