Smoothing - Back-off Algorithm
#-------------------------------------------------------------------------------
#
# This module determines the sentence probability for all sentences in the test
# data set. It uses the back-off method to determine the probability.
#
# prob(w1,w2,w3) is calculated as follows:
#
# if trigram count(w1,w2,w3) is not zero:
#     prob(w1,w2,w3) = coff1 * trigram count(w1,w2,w3) / bigram count(w1,w2)
# else if bigram count(w1,w2) is not zero:
#     prob(w1,w2,w3) = coff2 * bigram count(w1,w2) / unigram count(w1)
# else:
#     prob(w1,w2,w3) = coff3 * unigram count(w1) / total no. of unigrams
#
#-------------------------------------------------------------------------------
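# Worked example, for illustration only, with made-up counts (not from any real
# corpus): if count(the,cat,sat) = 2 and count(the,cat) = 4, the trigram branch
# applies and prob = coff1 * 2/4 = 0.5 * 0.5 = 0.25. If instead
# count(the,cat,sat) = 0 but count(the,cat) = 4 and count(the) = 10, the bigram
# branch applies and prob = coff2 * 4/10 = 0.3 * 0.4 = 0.12.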
import re
coff1 = 0.5 #coff for trigram freq
coff2 = 0.3 #coff for bigram freq
coff3 = 0.2 #coff for unigram freq
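# Note: coff1 + coff2 + coff3 = 1.0; these are fixed, hand-chosen back-off
# weights rather than weights estimated from held-out data.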
def deleted_inter(traindata, testdata):
    print("starting to process the TRAINING data")
    trainUni, trainBi, trainTri = {}, {}, {}  # dictionaries that hold the ngram counts
    totUniTrain = 0     # total unigrams in training data set
    totBiTrain = 0      # total bigrams in training data set
    totTriTrain = 0     # total trigrams in training data set
    trainSentCount = 0  # total sentences in training data set

    # Loop through the training data to build the unigram, bigram and
    # trigram training dictionaries
    for line in open(traindata):
        line = line.rstrip()
        trainSentCount += 1
        words = wordTokenizier(line)
        last_w = "UNK"  # word two positions back (sentence-start padding)
        prev_w = "UNK"  # previous word (sentence-start padding)
        for w in words:
            if w in trainUni:
                trainUni[w] += 1
            else:
                trainUni[w] = 1
            totUniTrain += 1

            biw = prev_w + " " + w
            if biw in trainBi:
                trainBi[biw] += 1
            else:
                trainBi[biw] = 1
            totBiTrain += 1

            triw = last_w + " " + prev_w + " " + w
            if triw in trainTri:
                trainTri[triw] += 1
            else:
                trainTri[triw] = 1
            totTriTrain += 1

            last_w = prev_w
            prev_w = w

    # give the sentence-start padding a count so the divisions below never
    # hit a missing key for sentence-initial ngrams
    trainUni["UNK"] = trainSentCount
    trainBi["UNK UNK"] = trainSentCount
    print("training file processed with total lines ", trainSentCount)
    # Loop through the test data to calculate sentence probability
    # with the back-off method, using the dictionaries built from training data
    sentCount = 0
    for line in open(testdata):
        #print ("line ", line)
        line = line.rstrip()
        sentCount += 1
        words = wordTokenizier(line)
        sentTri = {}
        last_w = "UNK"
        prev_w = "UNK"
        # determine all trigrams in the sentence
        for w in words:
            triw = last_w + " " + prev_w + " " + w
            if triw in sentTri:
                sentTri[triw] += 1
            else:
                sentTri[triw] = 1
            last_w = prev_w
            prev_w = w

        # calculate probability from dictionaries built from training data
        # for all trigrams found in the sentence
        sentProb = 1
        for k in sentTri.keys():
            triwordlist = wordTokenizier(k)
            w1 = triwordlist[0]
            w2 = triwordlist[1]
            w1w2 = w1 + " " + w2
            # back off to the next lower level for unseen trigrams and bigrams
            if trainTri.get(k, 0) > 0:
                sentProb = sentProb * (coff1 * trainTri[k] / trainBi[w1w2])
            elif trainBi.get(w1w2, 0) > 0:
                sentProb = sentProb * (coff2 * trainBi[w1w2] / trainUni[w1])
            else:
                sentProb = sentProb * (coff3 * trainUni[w1] / totUniTrain)
        print("Sentence no", sentCount, "Probability", sentProb)
    #print ("all bigrams ", testBi)
    print("Completed processing of TEST data")
def wordTokenizier(line):
    # token pattern: all-caps acronyms, capitalized prefixes of camel-case words,
    # words (including apostrophes and hyphens), and single punctuation marks
    delimiters = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w\-]+|[.,!;:()^*'-/]"
    tokenList = re.findall(delimiters, line)
    return tokenList
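# For example (illustrative input, not from the original gist):
#   wordTokenizier("The CAT sat on the mat.")
#   -> ['The', 'CAT', 'sat', 'on', 'the', 'mat', '.']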
# set the training and test data sets
# call the back-off function
traindata = r"C:\Python34\Data\TOYDataEnglish.txt"
testdata = r"C:\Python34\Data\TOYDataEnglish.txt"
deleted_inter(traindata, testdata)
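# The per-trigram back-off step above can also be exercised in isolation.
# The helper below is a minimal sketch (not part of the original gist) that
# applies the same three-way back-off to hand-made toy counts.
def backoff_prob(tri, triCounts, biCounts, uniCounts, totUni):
    # tri is a space-separated trigram "w1 w2 w3"; w3 is the predicted word,
    # only the history counts (w1 w2 and w1) appear in the divisors
    w1, w2, w3 = tri.split(" ")
    w1w2 = w1 + " " + w2
    if triCounts.get(tri, 0) > 0:
        return coff1 * triCounts[tri] / biCounts[w1w2]
    elif biCounts.get(w1w2, 0) > 0:
        return coff2 * biCounts[w1w2] / uniCounts[w1]
    else:
        return coff3 * uniCounts.get(w1, 0) / totUni

# Example with made-up counts: the trigram is unseen, the bigram "the cat"
# was seen 4 times and "the" 10 times, so the bigram branch applies:
#   backoff_prob("the cat sat", {}, {"the cat": 4}, {"the": 10}, 100)
#   -> coff2 * 4 / 10 = 0.3 * 0.4 = 0.12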