This implements the cosine measure to compare two English sentences (Python, using NLTK and NumPy).
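For reference, the compare() function below scores the two sentences with the standard cosine of their binary term vectors:

    cos(v1, v2) = (v1 . v2) / (||v1|| * ||v2||)

which is exactly the dot(vec1, vec2) / (norm(vec1) * norm(vec2)) expression at the end of compare().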
import re

import nltk
from numpy import zeros, dot
from numpy.linalg import norm

# Load stop words, one per line (the stop-word file path is machine-specific)
stop_words = [w.strip() for w in open(r'C:\FWs.txt', 'r').readlines()]

# Tokenizer: runs of letters, hyphens and apostrophes, case-insensitive
splitter = re.compile(r"[a-z\-']+", re.I)
stemmer = nltk.PorterStemmer()

def add_word(word, d):
    """If word is not a stop word, add its stemmed form to dict d with a count."""
    w = word.lower()
    if w not in stop_words:
        ws = stemmer.stem(w)
        d.setdefault(ws, 0)
        d[ws] += 1

def doc_vec(doc, key_idx):
    """Build a binary term vector for doc over the shared key index."""
    vec = zeros(len(key_idx))
    for word in splitter.findall(doc):
        keydata = key_idx.get(stemmer.stem(word.lower()), None)
        if keydata:
            vec[keydata[0]] = 1
    return vec

def compare(doc1, doc2):
    """Return the cosine similarity of the two documents' term vectors."""
    # Strip all punctuation except - and ', convert to lower case,
    # and store word/occurrence counts in a dict
    all_words = dict()
    for dat in [doc1, doc2]:
        for w in splitter.findall(dat):
            add_word(w, all_words)

    # Build an index of keys so that we know the word positions for the vector
    key_idx = dict()  # key -> (position, count)
    keys = sorted(all_words.keys())
    for i, key in enumerate(keys):
        key_idx[key] = (i, all_words[key])
    del keys
    del all_words

    vec1 = doc_vec(doc1, key_idx)
    vec2 = doc_vec(doc2, key_idx)
    return float(dot(vec1, vec2) / (norm(vec1) * norm(vec2)))

if __name__ == '__main__':
    print("Running Test...")
    doc1 = "I see food"
    doc2 = "I like sea food"
    print("Using Doc1: %s\n\nUsing Doc2: %s\n" % (doc1, doc2))
    print("Similarity %s" % compare(doc1, doc2))