Last active
August 29, 2015 14:23
-
-
Save ryosuzuki/59ac5d288378353231ce to your computer and use it in GitHub Desktop.
Classification of CNN articles with LDA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import feedparser | |
import re | |
import json | |
import os.path | |
from gensim import corpora, models, similarities | |
from newspaper import Article | |
# Tokenised articles: one list of lowercase words per article.  Restored from
# the local cache file 'data.json' when it exists, otherwise rebuilt from the
# CNN top-stories RSS feed and written to that cache.
texts = []
if os.path.isfile('data.json'):
    # A previous run already cached the tokenised articles; reuse them.
    print('load data.json')
    with open('data.json') as cache:
        texts = json.load(cache)
else:
    print('get articles from cnn.com')
    rss = 'http://rss.cnn.com/rss/cnn_topstories.rss'
    # rss = 'http://rss.cnn.com/rss/cnn_tech.rss'
    feed = feedparser.parse(rss)
    links = [entry.link for entry in feed.entries]

    # Download every linked article and keep "<title> <body>" as one document.
    documents = []
    for link in links:
        article = Article(link)
        article.download()
        article.parse()
        documents.append(article.title + ' ' + article.text)
        # NOTE(review): the original indentation was lost; the running count
        # is assumed to be printed once per article -- confirm.
        print(len(documents))

    # Words that are dropped during tokenisation.
    stoplist = set('this it is i we are for a an of the and to in at photos image video caption hide'.split())

    # Lowercase each document, turn every non-letter into a space, split on
    # whitespace and discard the stop words.
    texts = []
    for document in documents:
        letters_only = re.sub("[^a-zA-Z]", " ", document.lower())
        texts.append([word for word in letters_only.split() if word not in stoplist])

    # Cache the tokenised articles so later runs skip the download.
    with open('data.json', 'w') as cache:
        json.dump(texts, cache)
# Build the vocabulary from the tokenised articles.  Per the gensim
# documentation, no_above=0.3 drops tokens that occur in more than 30% of the
# documents (the default lower bound of filter_extremes still applies).
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_above=0.3)

# gensim fills Dictionary.id2token lazily.  The script previously relied on a
# bare `dictionary.values()` call to trigger that as a side effect, which only
# happens on Python 2 (on Python 3 `values()` is a lazy view and the map stays
# empty).  Build the id -> token map explicitly so lookups through
# `dictionary.id2token` work on either interpreter.
dictionary.id2token = {
    token_id: token for token, token_id in dictionary.token2id.items()
}

# Bag-of-words representation of every article.
corpus = [dictionary.doc2bow(text) for text in texts]

# Fit a 4-topic LDA model.  No id2word mapping is passed, so the model reports
# words by their dictionary id rather than by their text.
LdaModel = models.ldamodel.LdaModel
lda = LdaModel(corpus, num_topics=4)
# Print the top keywords of every topic, one list per topic.  The loop bound
# used to be `lda.num_topics - 1`, which silently skipped the last topic.
for topic_id in range(lda.num_topics):
    keywords = []
    for item in lda.show_topic(topic_id):
        # Each item is indexed as (score, word id), with the id given as a
        # string because the model was built without an id2word mapping.
        # NOTE(review): newer gensim releases return (word, score) instead --
        # confirm the tuple order against the installed version.
        key = int(item[1])
        # Dictionary.__getitem__ builds the id -> token map on demand, so this
        # does not depend on `id2token` having been populated beforehand.
        keywords.append(dictionary[key])
    print(keywords)

# Sample output recorded before the loop-bound fix: only three lists are shown
# although the model has four topics.
"""
Output:
['women', 'cup', 'clinton', 'game', 'york', 'black', 'left', 'president', 'change', 'white']
['cup', 'game', 'women', 'clinton', 'change', 'left', 'president', 'white', 'says', 'eastern']
['al', 'leaders', 'groups', 'clinton', 'leader', 'group', 'isis', 'change', 'how', 'him']
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment