neuman · May 17, 2016 00:22
diff --git a/similarity.py b/similarity.py
 from __future__ import division, unicode_literals
 import math
 from textblob import TextBlob as tb
 from goose import Goose

 def tf(word, blob):
     """Return the term frequency of ``word`` inside ``blob``.

     The frequency is the number of times ``word`` occurs in ``blob.words``
     divided by the total number of entries in ``blob.words``.

     Args:
         word: the term to look up.
         blob: object exposing a ``words`` sequence that supports ``count``
             and ``len`` (the callers in this file pass TextBlob instances).

     Returns:
         The occurrence ratio, or 0.0 when ``blob.words`` is empty.
     """
     words = blob.words
     total = len(words)
     if not total:
         # A document without any words would otherwise raise
         # ZeroDivisionError; a term cannot occur in it, so its frequency is 0.
         return 0.0
     return words.count(word) / total

 def n_containing(word, bloblist):
     """Return how many documents in ``bloblist`` contain ``word``.

     Membership is checked against each document's tokenised ``words`` so
     that only whole-word matches count. TextBlob's own ``in`` operator
     behaves like ``str`` and matches substrings (``"art"`` would be found in
     ``"party"``), which inflates the document frequency.

     Args:
         word: the term to look for.
         bloblist: iterable of documents. Objects without a ``words``
             attribute (plain strings, token lists) are tested with their
             own ``in`` behaviour.

     Returns:
         The number of documents containing ``word``.
     """
     return sum(
         1 for blob in bloblist
         if word in getattr(blob, "words", blob)
     )

 def idf(word, bloblist):
     """Return the smoothed inverse document frequency of ``word``.

     Computed as ``log(N / (1 + df))`` where ``N`` is ``len(bloblist)`` and
     ``df`` is ``n_containing(word, bloblist)``. The added 1 keeps the
     denominator non-zero when no document contains the word.
     """
     total_docs = len(bloblist)
     smoothed_df = 1 + n_containing(word, bloblist)
     return math.log(total_docs / smoothed_df)

 def tfidf(word, blob, bloblist):
     """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

     The score is ``tf(word, blob)`` multiplied by ``idf(word, bloblist)``.
     """
     term_frequency = tf(word, blob)
     inverse_doc_frequency = idf(word, bloblist)
     return term_frequency * inverse_doc_frequency
    
 def find_keywords_across_articles(articles):
     """Print the six highest-scoring TF-IDF words of every extractable article.

     The text of each article is extracted with Goose from ``a.url`` and
     wrapped in a TextBlob. Articles whose extraction raises are reported and
     skipped. Each remaining document is then scored word by word with
     ``tfidf`` against the whole list of extracted documents.

     Args:
         articles: iterable of objects exposing a ``url`` attribute.

     Returns:
         None; the ranking is written to stdout.
     """
     g = Goose()
     bloblist = []
     for a in articles:
         try:
             blob = tb(g.extract(url=a.url).cleaned_text)
         except Exception as e:
             # Best effort: one article that cannot be fetched or parsed must
             # not abort the run, but report what went wrong. A single-argument
             # print() call behaves the same as a statement and as a function.
             print('problem! {!r}'.format(e))
         else:
             bloblist.append(blob)

     # Document numbers count successfully extracted articles only, so they
     # shift whenever an earlier article was skipped.
     for i, blob in enumerate(bloblist):
         print("Top words in document {}".format(i + 1))
         scores = {}
         for word in blob.words:
             # A repeated word always gets the same score; compute it once.
             if word not in scores:
                 scores[word] = tfidf(word, blob, bloblist)
         sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
         for word, score in sorted_words[:6]:
             print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
	from __future__ import division, unicode_literals
	import math
	from textblob import TextBlob as tb
	from goose import Goose

	def tf(word, blob):
	    """Return the term frequency of ``word`` inside ``blob``.

	    The frequency is the number of times ``word`` occurs in ``blob.words``
	    divided by the total number of entries in ``blob.words``.

	    Args:
	        word: the term to look up.
	        blob: object exposing a ``words`` sequence that supports ``count``
	            and ``len`` (the callers in this file pass TextBlob instances).

	    Returns:
	        The occurrence ratio, or 0.0 when ``blob.words`` is empty.
	    """
	    words = blob.words
	    total = len(words)
	    if not total:
	        # A document without any words would otherwise raise
	        # ZeroDivisionError; a term cannot occur in it, so its frequency is 0.
	        return 0.0
	    return words.count(word) / total

	def n_containing(word, bloblist):
	    """Return how many documents in ``bloblist`` contain ``word``.

	    Membership is checked against each document's tokenised ``words`` so
	    that only whole-word matches count. TextBlob's own ``in`` operator
	    behaves like ``str`` and matches substrings (``"art"`` would be found in
	    ``"party"``), which inflates the document frequency.

	    Args:
	        word: the term to look for.
	        bloblist: iterable of documents. Objects without a ``words``
	            attribute (plain strings, token lists) are tested with their
	            own ``in`` behaviour.

	    Returns:
	        The number of documents containing ``word``.
	    """
	    return sum(
	        1 for blob in bloblist
	        if word in getattr(blob, "words", blob)
	    )

	def idf(word, bloblist):
	    """Return the smoothed inverse document frequency of ``word``.

	    Computed as ``log(N / (1 + df))`` where ``N`` is ``len(bloblist)`` and
	    ``df`` is ``n_containing(word, bloblist)``. The added 1 keeps the
	    denominator non-zero when no document contains the word.
	    """
	    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

	def tfidf(word, blob, bloblist):
	    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

	    The score is ``tf(word, blob)`` multiplied by ``idf(word, bloblist)``.
	    """
	    return tf(word, blob) * idf(word, bloblist)

	def find_keywords_across_articles(articles):
	    """Print the six highest-scoring TF-IDF words of every extractable article.

	    The text of each article is extracted with Goose from ``a.url`` and
	    wrapped in a TextBlob. Articles whose extraction raises are reported and
	    skipped. Each remaining document is then scored word by word with
	    ``tfidf`` against the whole list of extracted documents.

	    Args:
	        articles: iterable of objects exposing a ``url`` attribute.

	    Returns:
	        None; the ranking is written to stdout.
	    """
	    g = Goose()
	    bloblist = []
	    for a in articles:
	        try:
	            blob = tb(g.extract(url=a.url).cleaned_text)
	        except Exception as e:
	            # Best effort: one article that cannot be fetched or parsed must
	            # not abort the run, but report what went wrong. A single-argument
	            # print() call behaves the same as a statement and as a function.
	            print('problem! {!r}'.format(e))
	        else:
	            bloblist.append(blob)

	    # Document numbers count successfully extracted articles only, so they
	    # shift whenever an earlier article was skipped.
	    for i, blob in enumerate(bloblist):
	        print("Top words in document {}".format(i + 1))
	        scores = {}
	        for word in blob.words:
	            # A repeated word always gets the same score; compute it once.
	            if word not in scores:
	                scores[word] = tfidf(word, blob, bloblist)
	        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
	        for word, score in sorted_words[:6]:
	            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
No results found