Skip to content

Instantly share code, notes, and snippets.

@vaishaks
Created March 30, 2013 15:08
Show Gist options
  • Save vaishaks/5277040 to your computer and use it in GitHub Desktop.
Save vaishaks/5277040 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
__author__ = "vaishaks <[email protected]>"
__date__ = "Mar 25, 2013"
import sys
from collections import defaultdict
# Emission counts, keyed by the word and the tag concatenated with no
# separator (e.g. word 'p53' + tag 'I-GENE' -> 'p53I-GENE'). Filled in by
# create_counts(). Being a defaultdict(float), indexing a missing key
# returns 0.0 and also inserts that key.
emission_counts = defaultdict(float)
# Unigram counts, keyed by tag. Filled in by create_counts().
unigram_counts = defaultdict(float)
def create_counts(filename):
    """Load emission and unigram counts from a counts file into the globals.

    Each line of the file is split on whitespace. Two record types are
    used; every other record type is ignored:

        <count> WORDTAG <tag> <word>  ->  emission_counts[word + tag] = count
        <count> 1-GRAM <tag>          ->  unigram_counts[tag] = count

    For every word seen in a WORDTAG record, entries for both the 'O' and
    the 'I-GENE' tag are guaranteed to exist (0.0 unless a count is stored),
    so that emission() can tell known words from unknown ones by key lookup.

    Entries already present in the global dictionaries are not cleared;
    records read here overwrite entries that have the same key.

    Args:
        filename: Path of the counts file to read.

    If the file cannot be opened, an error message is written to stderr
    and the process exits with status 1.
    """
    try:
        count_file = open(filename, 'r')
    except IOError:
        sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % filename)
        sys.exit(1)
    # The context manager closes the file even if a record fails to parse.
    with count_file:
        for line in count_file:
            fields = line.split()
            # Blank or truncated lines are skipped instead of raising
            # IndexError.
            if len(fields) < 3:
                continue
            count, record_type, tag = fields[0], fields[1], fields[2]
            if record_type == 'WORDTAG':
                if len(fields) < 4:
                    continue
                word = fields[3]
                # setdefault (rather than the Python-2-only has_key) keeps
                # any count that is already stored for either tag.
                emission_counts.setdefault(word + "O", 0.0)
                emission_counts.setdefault(word + "I-GENE", 0.0)
                emission_counts[word + tag] = float(count)
            elif record_type == '1-GRAM':
                unigram_counts[tag] = float(count)
def map_infrequent_words(train_path='gene.train',
                         output_path='gene_rare.train', threshold=5):
    """Copy the training data, replacing infrequent words with _RARE_.

    A word is infrequent when it occurs fewer than `threshold` times in
    the training data, counted over all of its tags, so that unknown words
    can later be handled through the _RARE_ pseudo-word. Each
    "word tag" line of an infrequent word is written as "_RARE_ tag"; all
    other lines, including blank sentence separators and lines with fewer
    than two fields, are copied unchanged.

    The counts are read from the global emission_counts, so create_counts()
    must have been called on the counts of the same training data first.

    Args:
        train_path: Tagged training file to read ("word tag" per line).
        output_path: File to write the _RARE_-mapped copy to.
        threshold: Minimum total number of occurrences for a word to be
            kept as is.
    """
    # The two tags for which create_counts() registers an entry per word.
    tags = ("O", "I-GENE")
    with open(train_path, 'r') as training_file, \
            open(output_path, 'w') as new_training_file:
        for line in training_file:
            fields = line.split()
            if len(fields) < 2:
                # Sentence separator (or a line that is not "word tag").
                new_training_file.write(line)
                continue
            word, tag = fields[0], fields[1]
            # Sum over the tags: a word is rare by its total frequency,
            # not by how often it carries this particular tag. `get`
            # avoids inserting keys into the defaultdict.
            occurrences = sum(emission_counts.get(word + known_tag, 0.0)
                              for known_tag in tags)
            if occurrences < threshold:
                new_training_file.write("_RARE_ " + tag + "\n")
            else:
                new_training_file.write(line)
def emission(word, tag):
    """Return the emission parameter e(word | tag).

        e(x|y) = Count(y -> x) / Count(y)

    Count(y -> x) is the emission count, i.e. how often x was tagged as y.
    Count(y) is the unigram count, i.e. how often any word was tagged as y.

    A word with no emission entry for `tag` is treated as the pseudo-word
    _RARE_, so the result is then Count(y -> _RARE_) / Count(y).

    Args:
        word: The word x.
        tag: The tag y.

    Returns:
        The emission parameter as a float; 0.0 when `tag` has no (or a
        zero) unigram count instead of raising ZeroDivisionError.
    """
    # `get` and `in` are used throughout so that a lookup never inserts a
    # new key into the global defaultdicts.
    tag_total = unigram_counts.get(tag, 0.0)
    if not tag_total:
        return 0.0
    key = word + tag
    if key not in emission_counts:
        key = '_RARE_' + tag
    return emission_counts.get(key, 0.0) / tag_total
def tagger(word):
    """Pick the tag whose emission parameter for `word` is the largest.

    Every tag present in the unigram counts is scored with
    emission(word, tag). When several tags share the top score, the one
    that comes first in the iteration order of the unigram counts wins.
    """
    def score(candidate):
        return emission(word, candidate)

    # max() returns the first maximal item it encounters.
    return max(unigram_counts.keys(), key=score)
def file_tagger(input_path='gene.dev', output_path='gene_dev.p1.out'):
    """Tag every word of an untagged file and write the result to a file.

    The input holds one word per line, with empty lines separating
    sentences. For each word a "word tag" line is written, where the tag
    is chosen by tagger(); empty lines are written through as empty lines.

    Args:
        input_path: Untagged file to read.
        output_path: File to write the tagged words to.
    """
    with open(input_path, 'r') as inputfile, \
            open(output_path, 'w') as outputfile:
        for line in inputfile:
            # Strip only the line terminator. Slicing off the last
            # character unconditionally would truncate the final word of a
            # file that does not end with a newline.
            word = line.rstrip('\n')
            if not word:
                outputfile.write('\n')
            else:
                outputfile.write(word + " " + tagger(word) + "\n")
# Driver: these statements run whenever the module is executed or imported
# (there is no __main__ guard).
# 1. Load the counts of the original training data.
create_counts("gene.count")
# 2. Use those counts to write gene_rare.train, a copy of gene.train in
#    which infrequent words are replaced by _RARE_.
map_infrequent_words()
# 3. Load the counts of the _RARE_-mapped data. Nothing in this file writes
#    gene_rare.count; presumably it is generated from gene_rare.train by a
#    separate counting step -- TODO confirm.
# NOTE(review): create_counts() does not clear the global dictionaries, so
# emission entries loaded from gene.count that do not appear in
# gene_rare.count keep their old values here -- confirm this is intended.
create_counts("gene_rare.count")
# Disabled debugging aids (Python 2 print statements).
#print emission_counts['_RARE_O']
#print emission_counts['_RARE_I-GENE']
#print emission('Pol', 'O')
#print tagger('vai')
# 4. Tag gene.dev and write the result to gene_dev.p1.out.
file_tagger()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment