Created
October 7, 2015 05:54
-
-
Save catermelon/eb9d6bfb6f8f655cb979 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Allows scoring of text using n-gram probabilities | |
17/07/12 | |
''' | |
from math import log10 | |
# Define a class called ngram_score.
# NOTE: by PEP 8 convention class names use CapWords (e.g. NGramScore); the lowercase name is kept as-is here.
class ngram_score(object):
    '''Score text by summing the log10 probabilities of its fixed-length n-grams.

    Attributes:
        ngrams: dict mapping each n-gram read from the file to
            log10(count / N).
        L: n-gram length, taken from the last record read from the file.
        N: sum of all counts in the file.
        floor: log10(0.01 / N); the value added for any n-gram that is not
            in the table, so unseen n-grams are penalised rather than
            making the score undefined (log10(0)).
    '''

    def __init__(self, ngramfile, sep=' '):
        '''Load n-grams and counts from a file and convert counts to log probabilities.

        Args:
            ngramfile: path of a text file holding one
                "<ngram><sep><count>" record per line.
            sep: string separating the n-gram from its count on each line.

        Raises:
            ValueError: if a non-blank line does not split into exactly two
                fields, a count is not an integer, the file holds no
                records, or the counts do not sum to a positive number.
        '''
        counts = {}
        key = None
        # The context manager guarantees the file is closed, even on a parse error.
        with open(ngramfile, 'r') as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (typically a trailing newline at EOF).
                    continue
                key, count = line.rstrip('\r\n').split(sep)
                counts[key] = int(count)
        if key is None:
            raise ValueError('no n-gram records found in %r' % (ngramfile,))

        # The window width used by score() is the length of the last key read;
        # the file is expected to contain n-grams of a single length.
        self.L = len(key)
        # .values() works on both Python 2 and 3 (itervalues() is Python 2 only).
        self.N = sum(counts.values())
        if self.N <= 0:
            raise ValueError('n-gram counts in %r must sum to a positive number'
                             % (ngramfile,))

        # Convert each raw count into the log10 of its relative frequency.
        total = float(self.N)
        self.ngrams = dict((gram, log10(n / total)) for gram, n in counts.items())
        # Score for n-grams missing from the table: as if seen 0.01 times.
        self.floor = log10(0.01 / self.N)

    def score(self, text):
        '''Return the sum of log10 probabilities of every n-gram window in text.

        Slides a window of width self.L over text one character at a time;
        each window contributes its table value, or self.floor when it is
        not in the table. Text shorter than self.L has no windows and
        scores 0.
        '''
        width = self.L
        ngrams = self.ngrams
        floor = self.floor
        total = 0
        # range() works on both Python 2 and 3 (xrange is Python 2 only).
        for start in range(len(text) - width + 1):
            total += ngrams.get(text[start:start + width], floor)
        return total
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment