Smoothing using Deleted Interpolation and Laplace's Add-One Algorithm
#-------------------------------------------------------------------------------
#
# This module determines the sentence probability for all sentences in the test
# data set. It uses deleted interpolation as (for a trigram w1 w2 w3):
#
#   prob(w1,w2,w3) = coff1*(trigram freq(w1,w2,w3)/bigram freq(w1,w2)) +
#                    coff2*(bigram freq(w1,w2)/unigram freq(w1)) +
#                    coff3*(unigram freq(w1)/total no. of unigram tokens)
#
#-------------------------------------------------------------------------------
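# A quick worked example with hypothetical counts and the weights defined
# below: if trigram freq = 2, bigram freq = 4, unigram freq = 10 and the
# corpus has 100 unigram tokens, the interpolated term is
#   0.5*(2/4) + 0.3*(4/10) + 0.2*(10/100) = 0.25 + 0.12 + 0.02 = 0.39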
import re

coff1 = 0.5  # interpolation weight for the trigram term
coff2 = 0.3  # interpolation weight for the bigram term
coff3 = 0.2  # interpolation weight for the unigram term
def deleted_inter(traindata, testdata):
    print("starting to process the TRAINING data")
    trainUni, trainBi, trainTri = {}, {}, {}  # n-gram count dictionaries
    totUniTrain = 0     # total unigram tokens in the training data set
    totBiTrain = 0      # total bigram tokens in the training data set
    totTriTrain = 0     # total trigram tokens in the training data set
    trainSentCount = 0  # total sentences in the training data set
    # Loop through the training data to build the unigram, bigram and
    # trigram training dictionaries
    for line in open(traindata):
        line = line.rstrip()
        trainSentCount += 1
        words = wordTokenizier(line)
        last_w = "UNK"
        prev_w = "UNK"
        for w in words:
            trainUni[w] = trainUni.get(w, 0) + 1
            totUniTrain += 1
            biw = prev_w + " " + w
            trainBi[biw] = trainBi.get(biw, 0) + 1
            totBiTrain += 1
            triw = last_w + " " + prev_w + " " + w
            trainTri[triw] = trainTri.get(triw, 0) + 1
            totTriTrain += 1
            last_w = prev_w
            prev_w = w
    # pseudo-counts for the sentence-initial UNK context words
    trainUni["UNK"] = trainSentCount
    trainBi["UNK UNK"] = trainSentCount
    print("training file processed with total lines", trainSentCount)
    # Loop through the test data to calculate sentence probability
    # using deleted interpolation with the dictionaries built from training data
    sentCount = 0
    for line in open(testdata):
        line = line.rstrip()
        sentCount += 1
        words = wordTokenizier(line)
        sentTri = {}
        last_w = "UNK"
        prev_w = "UNK"
        # determine all trigrams in the sentence
        for w in words:
            triw = last_w + " " + prev_w + " " + w
            sentTri[triw] = sentTri.get(triw, 0) + 1
            last_w = prev_w
            prev_w = w
        # calculate probability from the dictionaries built from training data
        # for all trigrams found in the sentence
        sentProb = 1.0
        for k, count in sentTri.items():
            triwordlist = k.split()
            w1 = triwordlist[0]
            w2 = triwordlist[1]
            w1w2 = w1 + " " + w2
            # deleted interpolation; .get() avoids KeyErrors for n-grams
            # unseen in training, and a zero denominator drops that term
            triCount = trainTri.get(k, 0)
            biCount = trainBi.get(w1w2, 0)
            uniCount = trainUni.get(w1, 0)
            prob = coff3 * uniCount / totUniTrain
            if uniCount > 0:
                prob += coff2 * biCount / uniCount
            if biCount > 0:
                prob += coff1 * triCount / biCount
            # a trigram repeated in the sentence contributes once per occurrence
            sentProb = sentProb * (prob ** count)
        print("Sentence no", sentCount, "Probability", sentProb)
    print("Completed processing of test data")
def wordTokenizier(line):
    # crude tokenizer: matches acronyms, CamelCase parts, words
    # (with apostrophes/hyphens) and single punctuation marks
    pattern = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w-]+|[.,!;:()^*'\-/]"
    tokenList = re.findall(pattern, line)
    return tokenList
# set the training and test data sets, then
# call the deleted interpolation function
traindata = r"C:\Python34\Data\TOYDataEnglish.txt"
testdata = r"C:\Python34\Data\TOYDataEnglish.txt"
deleted_inter(traindata, testdata)
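A minimal smoke test, if you don't have the TOYDataEnglish.txt file referenced above, is to point the script at a small corpus of its own; the file name and sentences below are made up for illustration:

# hypothetical smoke test: write a tiny corpus and score it against itself
toy = "toy_corpus.txt"  # made-up file name
with open(toy, "w") as f:
    f.write("the cat sat on the mat\n")
    f.write("the dog sat on the rug\n")
deleted_inter(toy, toy)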
#-------------------------------------------------------------------------------
#
# This module determines the sentence probability for all sentences in the test
# data set using Laplace's add-one smoothing algorithm
#
#-------------------------------------------------------------------------------
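# Worked example with hypothetical counts: if count(w1 w2) = 3,
# count(w1) = 10 and the vocabulary has 8 distinct words, add-one gives
#   (3 + 1) / (10 + 8) = 4/18 = 0.222...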
import re
def Lapl_Smth(traindata, testdata):
    print("starting to process the training data")
    trainUni, trainBi = {}, {}  # training unigram and bigram count dictionaries
    totUniTrain = 0     # total unigram tokens in the training data set
    totBiTrain = 0      # total bigram tokens in the training data set
    trainSentCount = 0  # total sentences in the training data set
    # Loop through the training data set to build the unigram and bigram
    # training dictionaries
    for line in open(traindata):
        line = line.rstrip()
        trainSentCount += 1
        words = wordTokenizier(line)
        prev_w = "UNK"
        for w in words:
            trainUni[w] = trainUni.get(w, 0) + 1
            totUniTrain += 1
            biw = prev_w + " " + w
            trainBi[biw] = trainBi.get(biw, 0) + 1
            totBiTrain += 1
            prev_w = w
    # pseudo-count for the sentence-initial UNK context word
    trainUni["UNK"] = trainSentCount
    print("training data processed with total lines", trainSentCount)
    # Loop through the test data and calculate sentence probability
    # using Laplace's add-one smoothing
    vocabSize = len(trainUni)  # V: number of distinct training words
    sentCount = 0
    for line in open(testdata):
        line = line.rstrip()
        sentCount += 1
        words = wordTokenizier(line)
        sentBi = {}
        prev_w = "UNK"
        for w in words:
            biw = prev_w + " " + w
            sentBi[biw] = sentBi.get(biw, 0) + 1
            prev_w = w
        sentProb = 1.0
        for k, count in sentBi.items():
            w1 = k.split()[0]
            # add-one (Laplace) estimate: (count(w1 w2) + 1) / (count(w1) + V);
            # .get() handles bigrams and unigrams unseen in training
            prob = (trainBi.get(k, 0) + 1) / (trainUni.get(w1, 0) + vocabSize)
            sentProb = sentProb * (prob ** count)
        print("Sentence no", sentCount, "Probability", sentProb)
    print("Test data processed")
def wordTokenizier(line):
    # crude tokenizer: matches acronyms, CamelCase parts, words
    # (with apostrophes/hyphens) and single punctuation marks
    pattern = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w-]+|[.,!;:()^*'\-/]"
    tokenList = re.findall(pattern, line)
    return tokenList
# set the training and test data sets, then call the Laplace smoothing function
traindata = r"C:\Python34\Data\TOYDataEnglish.txt"
testdata = r"C:\Python34\Data\TOYDataEnglish.txt"
Lapl_Smth(traindata, testdata)
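Both scripts multiply raw probabilities, which underflows to 0.0 for long sentences. One way around that, sketched below against the same count dictionaries the script above builds, is to sum log probabilities instead (the function name and structure here are illustrative, not part of the original gist):

import math

def laplace_log_prob(sentBi, trainBi, trainUni, vocabSize):
    # sum of log add-one bigram probabilities for one sentence;
    # avoids the underflow that multiplying raw probabilities causes
    logProb = 0.0
    for k, count in sentBi.items():
        w1 = k.split()[0]
        p = (trainBi.get(k, 0) + 1) / (trainUni.get(w1, 0) + vocabSize)
        logProb += count * math.log(p)
    return logProb  # compare sentences by their log probability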