Created
May 17, 2016 00:22
-
-
Save neuman/366e24c3aa8eb93514e20201b1c1589e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division, unicode_literals | |
import math | |
from textblob import TextBlob as tb | |
from goose import Goose | |
def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    The value is ``blob.words.count(word)`` divided by the total number of
    words in ``blob``.

    Args:
        word: The term to look up.
        blob: An object exposing a ``words`` sequence (a TextBlob in this
            module).

    Returns:
        The fraction of ``blob.words`` accounted for by ``word``, or ``0.0``
        when the blob has no words at all.
    """
    words = blob.words
    total = len(words)
    # A document with no words (e.g. empty extracted text) would otherwise
    # raise ZeroDivisionError; no term can be frequent in an empty document.
    if total == 0:
        return 0.0
    # True division is guaranteed by the module-level
    # ``from __future__ import division``.
    return words.count(word) / total
def n_containing(word, bloblist):
    """Return the number of blobs in ``bloblist`` that contain ``word``.

    Membership is tested against ``blob.words``, the same word sequence that
    ``tf`` counts over.  Testing ``word in blob`` directly on a TextBlob is
    documented to behave like the ``in`` operator on a plain string, i.e. a
    substring search, which would count "art" as present in a document that
    only contains "party" and so overstate the document frequency.

    Args:
        word: The term to look for.
        bloblist: An iterable of objects exposing a ``words`` sequence.

    Returns:
        The count of blobs whose ``words`` include ``word``.
    """
    return sum(1 for blob in bloblist if word in blob.words)
def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as the natural log of the number of blobs divided by one plus
    the number of blobs containing ``word`` (as reported by
    ``n_containing``).  The added one keeps the denominator non-zero for
    words that appear in no blob.
    """
    total_docs = len(bloblist)
    docs_with_word = n_containing(word, bloblist)
    ratio = total_docs / (1 + docs_with_word)
    return math.log(ratio)
def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` relative to ``bloblist``.

    The score is the product of ``tf(word, blob)`` and
    ``idf(word, bloblist)``.
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency
def find_keywords_across_articles(articles):
    """Print the six highest-scoring TF-IDF words for each extracted article.

    For every item in ``articles`` the text is pulled with
    ``Goose().extract(url=item.url).cleaned_text`` and wrapped in a TextBlob.
    Articles whose extraction raises are reported with ``problem!`` and left
    out of the corpus.  Each remaining document is then scored word by word
    with ``tfidf`` against the whole corpus and its top six words are printed.

    Args:
        articles: An iterable of objects exposing a ``url`` attribute.

    Returns:
        None.  All results are written to standard output.
    """
    g = Goose()
    bloblist = []
    for a in articles:
        # Best effort: one article failing to download or parse must not
        # abort the whole run, so report it and move on.
        try:
            blob = tb(g.extract(url=a.url).cleaned_text)
        except Exception:
            # Call-style print matches the other prints in this function and
            # is valid on both Python 2 and Python 3.
            print('problem!')
        else:
            bloblist.append(blob)
    # NOTE: document numbers below count successfully extracted articles
    # only, so they drift from positions in ``articles`` after a failure.
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        # Score each distinct word once; repeated occurrences would yield the
        # same value and only repeat the work.
        scores = {}
        for word in blob.words:
            if word not in scores:
                scores[word] = tfidf(word, blob, bloblist)
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words[:6]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment