Skip to content

Instantly share code, notes, and snippets.

@gupul2k
Last active December 20, 2015 09:48
Show Gist options
  • Save gupul2k/6110094 to your computer and use it in GitHub Desktop.
Save gupul2k/6110094 to your computer and use it in GitHub Desktop.
This implements the cosine similarity measure to compare two English sentences (Python, using the NLTK and NumPy libraries).
import re
import nltk
from numpy import zeros,dot
from numpy.linalg import norm
# Get stop words
stop_words = [w.strip() for w in open('C:\FWs.txt','r').readlines()]
splitter = re.compile ( "[a-z\-']+", re.I )
stemmer = nltk.PorterStemmer()
def add_word(word,d):
#If word not in stop_words list, gets stemmed version of words and adds to dictionary (with count)
w = word.lower()
if w not in stop_words:
ws = stemmer.stem(w)
d.setdefault(ws,0)
d[ws] += 1
def doc_vec(doc,key_idx):
vec = zeros(len(key_idx))
for word in splitter.findall(doc):
keydata=key_idx.get(stemmer.stem(word).lower(), None)
if keydata: vec[keydata[0]] = 1
return vec
def compare(doc1,doc2):
# strip all punctuation but - and '
# convert to lower case store word/occurance in dict
all_words = dict()
for dat in [doc1,doc2]:
#print dat
[add_word(w,all_words) for w in splitter.findall(dat)]
# Build an index of keys so that we know the word positions for the vector
key_idx = dict() # key-> ( position, count )
keys = all_words.keys()
keys.sort()
#print keys
for i in range(len(keys)):
key_idx[keys[i]] = (i,all_words[keys[i]])
del keys
del all_words
vec1=doc_vec(doc1, key_idx)
vec2=doc_vec(doc2, key_idx)
return float(dot(vec1,vec2) / (norm(vec1) * norm(vec2)))
if 1==1:
print "Running Test..."
doc1="I see food"
doc2="I like sea food"
print "Using Doc1: %s\n\nUsing Doc2: %s\n" % ( doc1, doc2 )
print "Similarity %s" % compare(doc1,doc2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment