Skip to content

Instantly share code, notes, and snippets.

@ryosuzuki
Last active August 29, 2015 14:23
Show Gist options
  • Save ryosuzuki/59ac5d288378353231ce to your computer and use it in GitHub Desktop.
Save ryosuzuki/59ac5d288378353231ce to your computer and use it in GitHub Desktop.
Classification of CNN articles with LDA
import feedparser
import re
import json
import os.path
from gensim import corpora, models, similarities
from newspaper import Article
# Classify CNN articles with LDA.
#
# Pipeline: build (or reload from data.json) one token list per article, turn
# the token lists into a bag-of-words corpus, fit a 4-topic LDA model and
# print the top keywords of every topic.
texts = []
if not os.path.isfile('data.json'):
    # No cache yet: download the articles linked from the RSS feed.
    print('get articles from cnn.com')
    rss = 'http://rss.cnn.com/rss/cnn_topstories.rss'
    # Alternative feed: 'http://rss.cnn.com/rss/cnn_tech.rss'
    d = feedparser.parse(rss)
    links = [entry.link for entry in d.entries]
    documents = []
    for link in links:
        article = Article(link)
        article.download()
        article.parse()
        documents.append(article.title + ' ' + article.text)
        # Running count of downloaded articles, printed as progress.
        print(len(documents))

    # Common English words plus words that look like page-widget residue
    # ('photos', 'caption', 'hide', ...) carry no topical signal.
    stoplist = set('this it is i we are for a an of the and to in at photos image video caption hide'.split())
    # Compiled once; every non-letter character becomes a word separator.
    non_letters = re.compile(r"[^a-zA-Z]")
    texts = [
        [word
         for word in non_letters.sub(" ", document.lower()).split()
         if word not in stoplist]
        for document in documents
    ]
    # Cache the tokenised articles so later runs skip the download.
    with open('data.json', 'w') as cache_file:
        json.dump(texts, cache_file)
else:
    print('load data.json')
    with open('data.json') as cache_file:
        texts = json.load(cache_file)

dictionary = corpora.Dictionary(texts)
# Prune the vocabulary; no_above=0.3 is the only threshold overridden here.
dictionary.filter_extremes(no_above=0.3)
corpus = [dictionary.doc2bow(text) for text in texts]

LdaModel = models.ldamodel.LdaModel
# The model is trained without id2word, so show_topic() labels each term with
# its integer id rendered as a string rather than with the word itself.
lda = LdaModel(corpus, num_topics=4)

# Visit every topic: the previous bound of num_topics - 1 skipped the last one.
for i in range(lda.num_topics):
    keywords = []
    for item in lda.show_topic(i):
        # NOTE(review): gensim releases appear to differ in whether the pair
        # is (score, term) or (term, score) - confirm against the installed
        # version. Picking the string member works for either order.
        term_id = next(part for part in item if isinstance(part, str))
        # Indexing the dictionary maps the id back to its token.
        keywords.append(dictionary[int(term_id)])
    print(keywords)
"""
Sample output from an earlier run that listed only the first three of the
four topics:
['women', 'cup', 'clinton', 'game', 'york', 'black', 'left', 'president', 'change', 'white']
['cup', 'game', 'women', 'clinton', 'change', 'left', 'president', 'white', 'says', 'eastern']
['al', 'leaders', 'groups', 'clinton', 'leader', 'group', 'isis', 'change', 'how', 'him']
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment