Smoothing using Deleted Interpolation and Laplace's Add-One Algorithm
#-------------------------------------------------------------------------------
#
# This module determines the sentence probability for all sentences in the test
# data set. It uses deleted interpolation as (for a trigram w1 w2 w3):
#
#   prob(w1,w2,w3) = coff1*(trigram freq(w1,w2,w3)/bigram freq(w1,w2)) +
#                    coff2*(bigram freq(w1,w2)/unigram freq(w1)) +
#                    coff3*(unigram freq(w1)/total no. of unigram tokens)
#
#-------------------------------------------------------------------------------
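# A quick worked example with hypothetical counts and the weights defined
# below: if trigram freq = 2, bigram freq = 4, unigram freq = 10 and the
# corpus has 100 unigram tokens, the interpolated term is
#   0.5*(2/4) + 0.3*(4/10) + 0.2*(10/100) = 0.25 + 0.12 + 0.02 = 0.39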
import re

coff1 = 0.5  # interpolation weight for the trigram term
coff2 = 0.3  # interpolation weight for the bigram term
coff3 = 0.2  # interpolation weight for the unigram term
def deleted_inter(traindata, testdata):
    print("starting to process the TRAINING data")
    trainUni, trainBi, trainTri = {}, {}, {}  # n-gram count dictionaries
    totUniTrain = 0     # total unigram tokens in the training data set
    totBiTrain = 0      # total bigram tokens in the training data set
    totTriTrain = 0     # total trigram tokens in the training data set
    trainSentCount = 0  # total sentences in the training data set
    # Loop through the training data to build the unigram, bigram and
    # trigram training dictionaries
    for line in open(traindata):
        line = line.rstrip()
        trainSentCount += 1
        words = wordTokenizier(line)
        last_w = "UNK"
        prev_w = "UNK"
        for w in words:
            trainUni[w] = trainUni.get(w, 0) + 1
            totUniTrain += 1
            biw = prev_w + " " + w
            trainBi[biw] = trainBi.get(biw, 0) + 1
            totBiTrain += 1
            triw = last_w + " " + prev_w + " " + w
            trainTri[triw] = trainTri.get(triw, 0) + 1
            totTriTrain += 1
            last_w = prev_w
            prev_w = w
    # pseudo-counts for the sentence-initial UNK context words
    trainUni["UNK"] = trainSentCount
    trainBi["UNK UNK"] = trainSentCount
    print("training file processed with total lines", trainSentCount)
    # Loop through the test data to calculate sentence probability
    # using deleted interpolation with the dictionaries built from training data
    sentCount = 0
    for line in open(testdata):
        line = line.rstrip()
        sentCount += 1
        words = wordTokenizier(line)
        sentTri = {}
        last_w = "UNK"
        prev_w = "UNK"
        # determine all trigrams in the sentence
        for w in words:
            triw = last_w + " " + prev_w + " " + w
            sentTri[triw] = sentTri.get(triw, 0) + 1
            last_w = prev_w
            prev_w = w
        # calculate probability from the dictionaries built from training data
        # for all trigrams found in the sentence
        sentProb = 1.0
        for k, count in sentTri.items():
            triwordlist = k.split()
            w1 = triwordlist[0]
            w2 = triwordlist[1]
            w1w2 = w1 + " " + w2
            # deleted interpolation; .get() avoids KeyErrors for n-grams
            # unseen in training, and a zero denominator drops that term
            triCount = trainTri.get(k, 0)
            biCount = trainBi.get(w1w2, 0)
            uniCount = trainUni.get(w1, 0)
            prob = coff3 * uniCount / totUniTrain
            if uniCount > 0:
                prob += coff2 * biCount / uniCount
            if biCount > 0:
                prob += coff1 * triCount / biCount
            # a trigram repeated in the sentence contributes once per occurrence
            sentProb = sentProb * (prob ** count)
        print("Sentence no", sentCount, "Probability", sentProb)
    print("Completed processing of test data")
def wordTokenizier(line):
    # crude tokenizer: matches acronyms, CamelCase parts, words
    # (with apostrophes/hyphens) and single punctuation marks
    pattern = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w-]+|[.,!;:()^*'\-/]"
    tokenList = re.findall(pattern, line)
    return tokenList
# set the training and test data sets, then
# call the deleted interpolation function
traindata = r"C:\Python34\Data\TOYDataEnglish.txt"
testdata = r"C:\Python34\Data\TOYDataEnglish.txt"
deleted_inter(traindata, testdata)
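A minimal smoke test, if you don't have the TOYDataEnglish.txt file referenced above, is to point the script at a small corpus of its own; the file name and sentences below are made up for illustration:

# hypothetical smoke test: write a tiny corpus and score it against itself
toy = "toy_corpus.txt"  # made-up file name
with open(toy, "w") as f:
    f.write("the cat sat on the mat\n")
    f.write("the dog sat on the rug\n")
deleted_inter(toy, toy)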
#-------------------------------------------------------------------------------
#
# This module determines the sentence probability for all sentences in the test
# data set using Laplace's add-one smoothing algorithm
#
#-------------------------------------------------------------------------------
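# Worked example with hypothetical counts: if count(w1 w2) = 3,
# count(w1) = 10 and the vocabulary has 8 distinct words, add-one gives
#   (3 + 1) / (10 + 8) = 4/18 = 0.222...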
import re
def Lapl_Smth(traindata, testdata):
    print("starting to process the training data")
    trainUni, trainBi = {}, {}  # training unigram and bigram count dictionaries
    totUniTrain = 0     # total unigram tokens in the training data set
    totBiTrain = 0      # total bigram tokens in the training data set
    trainSentCount = 0  # total sentences in the training data set
    # Loop through the training data set to build the unigram and bigram
    # training dictionaries
    for line in open(traindata):
        line = line.rstrip()
        trainSentCount += 1
        words = wordTokenizier(line)
        prev_w = "UNK"
        for w in words:
            trainUni[w] = trainUni.get(w, 0) + 1
            totUniTrain += 1
            biw = prev_w + " " + w
            trainBi[biw] = trainBi.get(biw, 0) + 1
            totBiTrain += 1
            prev_w = w
    # pseudo-count for the sentence-initial UNK context word
    trainUni["UNK"] = trainSentCount
    print("training data processed with total lines", trainSentCount)
    # Loop through the test data and calculate sentence probability
    # using Laplace's add-one smoothing
    vocabSize = len(trainUni)  # V: number of distinct training words
    sentCount = 0
    for line in open(testdata):
        line = line.rstrip()
        sentCount += 1
        words = wordTokenizier(line)
        sentBi = {}
        prev_w = "UNK"
        for w in words:
            biw = prev_w + " " + w
            sentBi[biw] = sentBi.get(biw, 0) + 1
            prev_w = w
        sentProb = 1.0
        for k, count in sentBi.items():
            w1 = k.split()[0]
            # add-one (Laplace) estimate: (count(w1 w2) + 1) / (count(w1) + V);
            # .get() handles bigrams and unigrams unseen in training
            prob = (trainBi.get(k, 0) + 1) / (trainUni.get(w1, 0) + vocabSize)
            sentProb = sentProb * (prob ** count)
        print("Sentence no", sentCount, "Probability", sentProb)
    print("Test data processed")
def wordTokenizier(line):
    # crude tokenizer: matches acronyms, CamelCase parts, words
    # (with apostrophes/hyphens) and single punctuation marks
    pattern = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w-]+|[.,!;:()^*'\-/]"
    tokenList = re.findall(pattern, line)
    return tokenList
# set the training and test data sets, then call the Laplace smoothing function
traindata = r"C:\Python34\Data\TOYDataEnglish.txt"
testdata = r"C:\Python34\Data\TOYDataEnglish.txt"
Lapl_Smth(traindata, testdata)
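Both scripts multiply raw probabilities, which underflows to 0.0 for long sentences. One way around that, sketched below against the same count dictionaries the script above builds, is to sum log probabilities instead (the function name and structure here are illustrative, not part of the original gist):

import math

def laplace_log_prob(sentBi, trainBi, trainUni, vocabSize):
    # sum of log add-one bigram probabilities for one sentence;
    # avoids the underflow that multiplying raw probabilities causes
    logProb = 0.0
    for k, count in sentBi.items():
        w1 = k.split()[0]
        p = (trainBi.get(k, 0) + 1) / (trainUni.get(w1, 0) + vocabSize)
        logProb += count * math.log(p)
    return logProb  # compare sentences by their log probability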