vaishaks · March 30, 2013 15:08
diff --git a/emission.py b/emission.py
 #!/usr/bin/env python

 __author__ = "vaishaks <[email protected]>"
 __date__ = "Mar 25, 2013"

 import sys
 from collections import defaultdict

 # Emission table filled by create_counts(): the key is the word concatenated
 # with its tag (word+tag), the value is how many times the word carried that
 # tag. Missing keys read as 0.0 because of defaultdict(float).
 emission_counts = defaultdict(float)
 # Unigram table filled by create_counts(): the key is a tag, the value is the
 # total number of times that tag occurred. Missing keys read as 0.0.
 unigram_counts = defaultdict(float)

 def create_counts(filename):
 	"""Read a count file and load it into the module-level count tables.

 	Every line is split on whitespace and dispatched on its second field:

 	  <count> WORDTAG <tag> <word>  ->  emission_counts[word+tag] = count
 	  <count> 1-GRAM <tag>          ->  unigram_counts[tag] = count

 	For each WORDTAG word the keys word+"O" and word+"I-GENE" are created
 	with 0.0 when absent, so a word seen with only one tag still has an
 	explicit zero entry for the other one. Entries are assigned, never
 	cleared, so loading a second file merges into the existing tables.

 	Blank lines, truncated lines and other record types are ignored.
 	If the file cannot be opened, an error is written to stderr and the
 	process exits with status 1.
 	"""
 	try:
 		count_file = open(filename, 'r')
 	except IOError:
 		sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % filename)
 		sys.exit(1)
 	# Close the handle deterministically once every line has been consumed.
 	with count_file:
 		for line in count_file:
 			fields = line.split()
 			if len(fields) < 3:
 				# Blank or truncated line: skip it instead of raising IndexError.
 				continue
 			record_type = fields[1]
 			if record_type == 'WORDTAG' and len(fields) >= 4:
 				word = fields[3]
 				# setdefault replaces the has_key() checks; dict.has_key()
 				# does not exist in Python 3.
 				emission_counts.setdefault(word + "O", 0.0)
 				emission_counts.setdefault(word + "I-GENE", 0.0)
 				emission_counts[word + fields[2]] = float(fields[0])
 			elif record_type == '1-GRAM':
 				unigram_counts[fields[2]] = float(fields[0])

 def map_infrequent_words(train_path='gene.train', out_path='gene_rare.train', threshold=5):
 	"""Copy the training file, replacing infrequent words with _RARE_.

 	train_path -- tagged training file to read, one "word tag" pair per line.
 	out_path   -- file to write the rewritten corpus to.
 	threshold  -- a word is infrequent when its total count is below this.

 	A word's total count is the sum of emission_counts[word+tag] over all
 	known tags, so a word is judged by how often it occurs overall, not by
 	how often it occurs with the tag on the current line. Lines for
 	infrequent words are written as "_RARE_ <tag>"; all other lines,
 	including blank separator lines, are copied unchanged.
 	"""
 	# create_counts() registers "O" and "I-GENE" for every word; tags found in
 	# the unigram table are added on top of those.
 	tags = set(unigram_counts)
 	tags.update(("O", "I-GENE"))
 	with open(train_path, 'r') as training_file:
 		with open(out_path, 'w') as new_training_file:
 			for line in training_file:
 				fields = line.split()
 				if len(fields) < 2:
 					# Sentence separator (or a line without a tag): keep as is.
 					new_training_file.write(line)
 					continue
 				word = fields[0]
 				# .get() avoids inserting new keys into the defaultdict table.
 				total = sum(emission_counts.get(word + tag, 0.0) for tag in tags)
 				if total < threshold:
 					new_training_file.write("_RARE_ " + fields[1] + "\n")
 				else:
 					new_training_file.write(line)

 def emission(word, tag):
 	"""Return the emission estimate e(x|y) for word x and tag y.

 	e(x|y) = Count(y->x) / Count(y)
 	Count(y->x) is emission_counts[word+tag], the number of times the word
 	was tagged as y. Count(y) is unigram_counts[tag], the total number of
 	times tag y occurred. When word+tag has no entry in emission_counts,
 	the entry for '_RARE_'+tag is used in its place.

 	Raises ZeroDivisionError when the unigram count for the tag is zero.
 	"""
 	key = word + tag
 	# Membership test with `in`; dict.has_key() does not exist in Python 3.
 	if key not in emission_counts:
 		key = '_RARE_' + tag
 	return emission_counts[key] / unigram_counts[tag]

 def tagger(word):
 	"""Return the tag under which `word` has the highest emission value.

 	Every tag present in unigram_counts is scored with emission(word, tag).
 	When several tags share the highest score, the one that comes first in
 	the table's iteration order is returned.
 	"""
 	candidates = unigram_counts.keys()
 	# max() keeps the first of several equally scored candidates.
 	return max(candidates, key=lambda candidate: emission(word, candidate))

 def file_tagger(input_path='gene.dev', output_path='gene_dev.p1.out'):
 	"""Tag an untagged file and write the result to another file.

 	input_path  -- file to read, one word per line; empty lines separate
 	               sentences.
 	output_path -- file to write; each word becomes "<word> <tag>" with the
 	               tag chosen by tagger(), and empty lines are kept.
 	"""
 	with open(input_path, 'r') as inputfile:
 		with open(output_path, 'w') as outputfile:
 			for line in inputfile:
 				# Remove only the line terminator. Slicing off the last
 				# character unconditionally would truncate a final word
 				# that is not followed by a newline.
 				word = line.rstrip('\r\n')
 				if not word:
 					outputfile.write('\n')
 				else:
 					outputfile.write(word + " " + tagger(word) + "\n")

 # Driver: these statements run whenever the module is executed or imported,
 # and every step works on files in the current working directory.
 # 1. Load the word/tag and unigram counts of the original training data.
 create_counts("gene.count")
 # 2. Write gene_rare.train, where infrequent words are replaced by _RARE_.
 map_infrequent_words()
 # 3. Load the counts of the rewritten corpus into the same tables. Entries
 #    loaded in step 1 are overwritten or kept; nothing is cleared.
 # NOTE(review): nothing in this file produces gene_rare.count -- presumably it
 # is generated from gene_rare.train by a separate counting step; confirm.
 create_counts("gene_rare.count")
 # Disabled debugging probes (Python 2 print statements).
 #print emission_counts['_RARE_O']
 #print emission_counts['_RARE_I-GENE']
 #print emission('Pol', 'O')
 #print tagger('vai')
 # 4. Tag gene.dev and write the result to gene_dev.p1.out.
 file_tagger()
	#!/usr/bin/env python

	__author__ = "vaishaks <[email protected]>"
	__date__ = "Mar 25, 2013"

	import sys
	from collections import defaultdict

	# Emission table filled by create_counts(): the key is the word concatenated
	# with its tag (word+tag), the value is how many times the word carried that
	# tag. Missing keys read as 0.0 because of defaultdict(float).
	emission_counts = defaultdict(float)
	# Unigram table filled by create_counts(): the key is a tag, the value is the
	# total number of times that tag occurred. Missing keys read as 0.0.
	unigram_counts = defaultdict(float)

	def create_counts(filename):
		"""Read a count file and load it into the module-level count tables.

		Every line is split on whitespace and dispatched on its second field:

		  <count> WORDTAG <tag> <word>  ->  emission_counts[word+tag] = count
		  <count> 1-GRAM <tag>          ->  unigram_counts[tag] = count

		For each WORDTAG word the keys word+"O" and word+"I-GENE" are created
		with 0.0 when absent, so a word seen with only one tag still has an
		explicit zero entry for the other one. Entries are assigned, never
		cleared, so loading a second file merges into the existing tables.

		Blank lines, truncated lines and other record types are ignored.
		If the file cannot be opened, an error is written to stderr and the
		process exits with status 1.
		"""
		try:
			count_file = open(filename, 'r')
		except IOError:
			sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % filename)
			sys.exit(1)
		# Close the handle deterministically once every line has been consumed.
		with count_file:
			for line in count_file:
				fields = line.split()
				if len(fields) < 3:
					# Blank or truncated line: skip it instead of raising IndexError.
					continue
				record_type = fields[1]
				if record_type == 'WORDTAG' and len(fields) >= 4:
					word = fields[3]
					# setdefault replaces the has_key() checks; dict.has_key()
					# does not exist in Python 3.
					emission_counts.setdefault(word + "O", 0.0)
					emission_counts.setdefault(word + "I-GENE", 0.0)
					emission_counts[word + fields[2]] = float(fields[0])
				elif record_type == '1-GRAM':
					unigram_counts[fields[2]] = float(fields[0])

	def map_infrequent_words(train_path='gene.train', out_path='gene_rare.train', threshold=5):
		"""Copy the training file, replacing infrequent words with _RARE_.

		train_path -- tagged training file to read, one "word tag" pair per line.
		out_path   -- file to write the rewritten corpus to.
		threshold  -- a word is infrequent when its total count is below this.

		A word's total count is the sum of emission_counts[word+tag] over all
		known tags, so a word is judged by how often it occurs overall, not by
		how often it occurs with the tag on the current line. Lines for
		infrequent words are written as "_RARE_ <tag>"; all other lines,
		including blank separator lines, are copied unchanged.
		"""
		# create_counts() registers "O" and "I-GENE" for every word; tags found in
		# the unigram table are added on top of those.
		tags = set(unigram_counts)
		tags.update(("O", "I-GENE"))
		with open(train_path, 'r') as training_file:
			with open(out_path, 'w') as new_training_file:
				for line in training_file:
					fields = line.split()
					if len(fields) < 2:
						# Sentence separator (or a line without a tag): keep as is.
						new_training_file.write(line)
						continue
					word = fields[0]
					# .get() avoids inserting new keys into the defaultdict table.
					total = sum(emission_counts.get(word + tag, 0.0) for tag in tags)
					if total < threshold:
						new_training_file.write("_RARE_ " + fields[1] + "\n")
					else:
						new_training_file.write(line)

	def emission(word, tag):
		"""Return the emission estimate e(x|y) for word x and tag y.

		e(x|y) = Count(y->x) / Count(y)
		Count(y->x) is emission_counts[word+tag], the number of times the word
		was tagged as y. Count(y) is unigram_counts[tag], the total number of
		times tag y occurred. When word+tag has no entry in emission_counts,
		the entry for '_RARE_'+tag is used in its place.

		Raises ZeroDivisionError when the unigram count for the tag is zero.
		"""
		key = word + tag
		# Membership test with `in`; dict.has_key() does not exist in Python 3.
		if key not in emission_counts:
			key = '_RARE_' + tag
		return emission_counts[key] / unigram_counts[tag]

	def tagger(word):
		"""Return the tag under which `word` has the highest emission value.

		Every tag present in unigram_counts is scored with emission(word, tag).
		When several tags share the highest score, the one that comes first in
		the table's iteration order is returned.
		"""
		candidates = unigram_counts.keys()
		# max() keeps the first of several equally scored candidates.
		return max(candidates, key=lambda candidate: emission(word, candidate))

	def file_tagger(input_path='gene.dev', output_path='gene_dev.p1.out'):
		"""Tag an untagged file and write the result to another file.

		input_path  -- file to read, one word per line; empty lines separate
		               sentences.
		output_path -- file to write; each word becomes "<word> <tag>" with the
		               tag chosen by tagger(), and empty lines are kept.
		"""
		with open(input_path, 'r') as inputfile:
			with open(output_path, 'w') as outputfile:
				for line in inputfile:
					# Remove only the line terminator. Slicing off the last
					# character unconditionally would truncate a final word
					# that is not followed by a newline.
					word = line.rstrip('\r\n')
					if not word:
						outputfile.write('\n')
					else:
						outputfile.write(word + " " + tagger(word) + "\n")

	# Driver: these statements run whenever the module is executed or imported,
	# and every step works on files in the current working directory.
	# 1. Load the word/tag and unigram counts of the original training data.
	create_counts("gene.count")
	# 2. Write gene_rare.train, where infrequent words are replaced by _RARE_.
	map_infrequent_words()
	# 3. Load the counts of the rewritten corpus into the same tables. Entries
	#    loaded in step 1 are overwritten or kept; nothing is cleared.
	# NOTE(review): nothing in this file produces gene_rare.count -- presumably it
	# is generated from gene_rare.train by a separate counting step; confirm.
	create_counts("gene_rare.count")
	# Disabled debugging probes (Python 2 print statements).
	#print emission_counts['_RARE_O']
	#print emission_counts['_RARE_I-GENE']
	#print emission('Pol', 'O')
	#print tagger('vai')
	# 4. Tag gene.dev and write the result to gene_dev.p1.out.
	file_tagger()