Created
August 31, 2014 20:06
-
-
Save Pinak-Chakraborty/67a931425f856b649ed2 to your computer and use it in GitHub Desktop.
Smoothing - Back off Algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#------------------------------------------------------------------------------- | |
# | |
# This module determines the sentence probability for all sentences in the test | |
# data set. It uses the Back-off method in determining the probability
# | |
# prob(w1,w2,w3) is calculated as follows: | |
# | |
# if trigram count(w1,w2,w3) not zero | |
# prob (w1,w2,w3) = coff1*trigram count (w1,w2,w3)/bigram count (w1,w2) | |
# else if bigram count (w1,w2) not zero | |
# prob = coff2*bigram count (w1,w2)/unigram count (w1) | |
# else | |
#                prob = coff3*unigram count (w1)/total no of unigrams
# | |
#------------------------------------------------------------------------------- | |
import re | |
coff1 = 0.5 #coff for trigram freq | |
coff2 = 0.3 #coff for bigram freq | |
coff3 = 0.2 #coff for unigram freq | |
def deleted_inter(traindata, testdata):
    """Train a back-off trigram model and score every test sentence.

    NOTE(review): despite the name, this implements Katz-style *back-off*,
    not deleted interpolation (only one level contributes per trigram).

    For each trigram (w1, w2, w3) seen in a test sentence:
      * trigram seen in training:  coff1 * count(w1,w2,w3) / count(w1,w2)
      * bigram  seen in training:  coff2 * count(w1,w2)    / count(w1)
      * otherwise:                 coff3 * count(w1)       / total unigrams

    traindata, testdata -- paths to plain-text files, one sentence per line.
    Results are printed; nothing is returned.
    """
    print("starting processing the TRAINING data")
    trainUni, trainBi, trainTri = {}, {}, {}  # n-gram string -> frequency
    totUniTrain = 0     # total unigram tokens in the training data
    trainSentCount = 0  # total sentences (lines) in the training file

    # Pass 1: harvest unigram/bigram/trigram counts from the training file.
    with open(traindata) as fh:
        for line in fh:
            trainSentCount += 1
            words = wordTokenizier(line.rstrip())
            last_w = "UNK"  # word two positions back (sentence-start padding)
            prev_w = "UNK"  # previous word (sentence-start padding)
            for w in words:
                trainUni[w] = trainUni.get(w, 0) + 1
                totUniTrain += 1
                biw = prev_w + " " + w
                trainBi[biw] = trainBi.get(biw, 0) + 1
                triw = last_w + " " + prev_w + " " + w
                trainTri[triw] = trainTri.get(triw, 0) + 1
                last_w = prev_w
                prev_w = w
    # Every sentence starts from the padding context "UNK UNK", so credit
    # those pseudo-tokens with one count per training sentence.
    trainUni["UNK"] = trainSentCount
    trainBi["UNK" + " " + "UNK"] = trainSentCount
    print("training file processed with total lines ", trainSentCount)

    # Pass 2: score each test sentence with the back-off estimates.
    sentCount = 0
    with open(testdata) as fh:
        for line in fh:
            sentCount += 1
            words = wordTokenizier(line.rstrip())
            # Collect this sentence's trigrams with their occurrence counts.
            sentTri = {}
            last_w = "UNK"
            prev_w = "UNK"
            for w in words:
                triw = last_w + " " + prev_w + " " + w
                sentTri[triw] = sentTri.get(triw, 0) + 1
                last_w = prev_w
                prev_w = w
            sentProb = 1
            for triw, cnt in sentTri.items():
                triwordlist = wordTokenizier(triw)
                w1 = triwordlist[0]
                w2 = triwordlist[1]
                w1w2 = w1 + " " + w2
                # Back off to the next lower order for unseen n-grams.
                # Denominators are safe: a counted trigram implies its
                # prefix bigram was counted, and a counted bigram implies
                # its first word was counted (or is "UNK").
                if trainTri.get(triw, 0) > 0:
                    prob = coff1 * trainTri[triw] / trainBi[w1w2]
                elif trainBi.get(w1w2, 0) > 0:
                    prob = coff2 * trainBi[w1w2] / trainUni[w1]
                else:
                    # BUG FIX: was 'trainUi' (NameError); also guard against
                    # a w1 never seen in training (probability contributes 0).
                    prob = coff3 * trainUni.get(w1, 0) / totUniTrain
                # BUG FIX: a trigram occurring k times in the sentence must
                # contribute k factors, not one (the count was collected but
                # ignored by the original key-only loop).
                sentProb *= prob ** cnt
            print("Sentence no", sentCount, "Probability", sentProb)
    print("Completed processing of TEST data")
def wordTokenizier(line):
    """Tokenize *line* into a list of word and punctuation tokens.

    Token alternatives, in priority order: an all-caps run (acronyms),
    a capitalized word immediately followed by another capital (CamelCase
    splitting), a run of word chars/apostrophes/hyphens, or a single
    punctuation character.
    """
    # BUG FIX: raw string — '\w' and '\-' are regex escapes, not string
    # escapes, and trigger SyntaxWarning/errors on modern Python otherwise.
    # The pattern itself is byte-identical to what re saw before.
    delimiters = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+|[.,!;:()^*'-/]"
    return re.findall(delimiters, line)
# Script entry point: train and test on the same toy corpus (a self-test).
# Raw strings keep the Windows-path backslashes literal, and the __main__
# guard stops the job from running on a mere import of this module.
if __name__ == "__main__":
    traindata = r"C:\Python34\Data\TOYDataEnglish.txt"
    testdata = r"C:\Python34\Data\TOYDataEnglish.txt"
    deleted_inter(traindata, testdata)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment