Skip to content

Instantly share code, notes, and snippets.

@gupul2k
Last active December 20, 2015 09:48
Show Gist options
  • Save gupul2k/6110094 to your computer and use it in GitHub Desktop.
Save gupul2k/6110094 to your computer and use it in GitHub Desktop.
This implements the cosine similarity measure to compare two English sentences (Python, using the NLTK and NumPy libraries).
import re
import nltk
from numpy import zeros,dot
from numpy.linalg import norm
# Get stop words
stop_words = [w.strip() for w in open('C:\FWs.txt','r').readlines()]
splitter = re.compile ( "[a-z\-']+", re.I )
stemmer = nltk.PorterStemmer()
def add_word(word,d):
#If word not in stop_words list, gets stemmed version of words and adds to dictionary (with count)
w = word.lower()
if w not in stop_words:
ws = stemmer.stem(w)
d.setdefault(ws,0)
d[ws] += 1
def doc_vec(doc,key_idx):
vec = zeros(len(key_idx))
for word in splitter.findall(doc):
keydata=key_idx.get(stemmer.stem(word).lower(), None)
if keydata: vec[keydata[0]] = 1
return vec
def compare(doc1,doc2):
# strip all punctuation but - and '
# convert to lower case store word/occurance in dict
all_words = dict()
for dat in [doc1,doc2]:
#print dat
[add_word(w,all_words) for w in splitter.findall(dat)]
# Build an index of keys so that we know the word positions for the vector
key_idx = dict() # key-> ( position, count )
keys = all_words.keys()
keys.sort()
#print keys
for i in range(len(keys)):
key_idx[keys[i]] = (i,all_words[keys[i]])
del keys
del all_words
vec1=doc_vec(doc1, key_idx)
vec2=doc_vec(doc2, key_idx)
return float(dot(vec1,vec2) / (norm(vec1) * norm(vec2)))
if 1==1:
print "Running Test..."
doc1="I see food"
doc2="I like sea food"
print "Using Doc1: %s\n\nUsing Doc2: %s\n" % ( doc1, doc2 )
print "Similarity %s" % compare(doc1,doc2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment