Created
August 31, 2014 20:06
-
-
Save Pinak-Chakraborty/67a931425f856b649ed2 to your computer and use it in GitHub Desktop.
Smoothing - Back off Algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#------------------------------------------------------------------------------- | |
# | |
# This module determines the sentence probability for all sentences in the test | |
# data set. It uses the Back-off method in determining the probability
# | |
# prob(w1,w2,w3) is calculated as follows: | |
# | |
# if trigram count(w1,w2,w3) not zero | |
# prob (w1,w2,w3) = coff1*trigram count (w1,w2,w3)/bigram count (w1,w2) | |
# else if bigram count (w1,w2) not zero | |
# prob = coff2*bigram count (w1,w2)/unigram count (w1) | |
# else | |
#                prob = coff3*unigram count (w1)/total no of unigrams
# | |
#------------------------------------------------------------------------------- | |
import re | |
coff1 = 0.5 #coff for trigram freq | |
coff2 = 0.3 #coff for bigram freq | |
coff3 = 0.2 #coff for unigram freq | |
def deleted_inter(traindata, testdata):
    """Train a back-off trigram model and score every test sentence.

    NOTE(review): despite the name, this implements Katz-style *back-off*,
    not deleted interpolation (only one level contributes per trigram).

    For each trigram (w1, w2, w3) seen in a test sentence:
      * trigram seen in training:  coff1 * count(w1,w2,w3) / count(w1,w2)
      * bigram  seen in training:  coff2 * count(w1,w2)    / count(w1)
      * otherwise:                 coff3 * count(w1)       / total unigrams

    traindata, testdata -- paths to plain-text files, one sentence per line.
    Results are printed; nothing is returned.
    """
    print("starting processing the TRAINING data")
    trainUni, trainBi, trainTri = {}, {}, {}  # n-gram string -> frequency
    totUniTrain = 0     # total unigram tokens in the training data
    trainSentCount = 0  # total sentences (lines) in the training file

    # Pass 1: harvest unigram/bigram/trigram counts from the training file.
    with open(traindata) as fh:
        for line in fh:
            trainSentCount += 1
            words = wordTokenizier(line.rstrip())
            last_w = "UNK"  # word two positions back (sentence-start padding)
            prev_w = "UNK"  # previous word (sentence-start padding)
            for w in words:
                trainUni[w] = trainUni.get(w, 0) + 1
                totUniTrain += 1
                biw = prev_w + " " + w
                trainBi[biw] = trainBi.get(biw, 0) + 1
                triw = last_w + " " + prev_w + " " + w
                trainTri[triw] = trainTri.get(triw, 0) + 1
                last_w = prev_w
                prev_w = w
    # Every sentence starts from the padding context "UNK UNK", so credit
    # those pseudo-tokens with one count per training sentence.
    trainUni["UNK"] = trainSentCount
    trainBi["UNK" + " " + "UNK"] = trainSentCount
    print("training file processed with total lines ", trainSentCount)

    # Pass 2: score each test sentence with the back-off estimates.
    sentCount = 0
    with open(testdata) as fh:
        for line in fh:
            sentCount += 1
            words = wordTokenizier(line.rstrip())
            # Collect this sentence's trigrams with their occurrence counts.
            sentTri = {}
            last_w = "UNK"
            prev_w = "UNK"
            for w in words:
                triw = last_w + " " + prev_w + " " + w
                sentTri[triw] = sentTri.get(triw, 0) + 1
                last_w = prev_w
                prev_w = w
            sentProb = 1
            for triw, cnt in sentTri.items():
                triwordlist = wordTokenizier(triw)
                w1 = triwordlist[0]
                w2 = triwordlist[1]
                w1w2 = w1 + " " + w2
                # Back off to the next lower order for unseen n-grams.
                # Denominators are safe: a counted trigram implies its
                # prefix bigram was counted, and a counted bigram implies
                # its first word was counted (or is "UNK").
                if trainTri.get(triw, 0) > 0:
                    prob = coff1 * trainTri[triw] / trainBi[w1w2]
                elif trainBi.get(w1w2, 0) > 0:
                    prob = coff2 * trainBi[w1w2] / trainUni[w1]
                else:
                    # BUG FIX: was 'trainUi' (NameError); also guard against
                    # a w1 never seen in training (probability contributes 0).
                    prob = coff3 * trainUni.get(w1, 0) / totUniTrain
                # BUG FIX: a trigram occurring k times in the sentence must
                # contribute k factors, not one (the count was collected but
                # ignored by the original key-only loop).
                sentProb *= prob ** cnt
            print("Sentence no", sentCount, "Probability", sentProb)
    print("Completed processing of TEST data")
def wordTokenizier(line):
    """Tokenize *line* into a list of word and punctuation tokens.

    Token alternatives, in priority order: an all-caps run (acronyms),
    a capitalized word immediately followed by another capital (CamelCase
    splitting), a run of word chars/apostrophes/hyphens, or a single
    punctuation character.
    """
    # BUG FIX: raw string — '\w' and '\-' are regex escapes, not string
    # escapes, and trigger SyntaxWarning/errors on modern Python otherwise.
    # The pattern itself is byte-identical to what re saw before.
    delimiters = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+|[.,!;:()^*'-/]"
    return re.findall(delimiters, line)
# Script entry point: train and test on the same toy corpus (a self-test).
# Raw strings keep the Windows-path backslashes literal, and the __main__
# guard stops the job from running on a mere import of this module.
if __name__ == "__main__":
    traindata = r"C:\Python34\Data\TOYDataEnglish.txt"
    testdata = r"C:\Python34\Data\TOYDataEnglish.txt"
    deleted_inter(traindata, testdata)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment