ryosuzuki · August 29, 2015 14:23
diff --git a/lda.py b/lda.py
 import feedparser
 import re
 import json
 import os.path
 from gensim import corpora, models, similarities
 from newspaper import Article

 # Tokenised corpus: one list of lowercase word tokens per article.
 texts = []

 if os.path.isfile('data.json'):
     # A previous run already cached the tokenised articles; reuse them
     # instead of downloading everything again.
     print('load data.json')
     with open('data.json') as cache:
         texts = json.load(cache)
 else:
     print('get articles from cnn.com')
     rss = 'http://rss.cnn.com/rss/cnn_topstories.rss'
     # rss = 'http://rss.cnn.com/rss/cnn_tech.rss'
     d = feedparser.parse(rss)
     links = [entry.link for entry in d.entries]

     # Download every linked article; a document is "<title> <body text>".
     documents = []
     for url in links:
         page = Article(url)
         page.download()
         page.parse()
         documents.append(page.title + ' ' + page.text)

     print(len(documents))

     stoplist = set('this it is i we are for a an of the and to in at photos image video caption hide'.split())

     # Lowercase, turn every non-letter into a space, split on whitespace,
     # then drop the stop words.
     non_letters = re.compile("[^a-zA-Z]")
     texts = []
     for document in documents:
         words = non_letters.sub(" ", document.lower()).split()
         texts.append([word for word in words if word not in stoplist])

     # Cache the tokens so later runs take the branch above.
     with open('data.json', 'w') as cache:
         json.dump(texts, cache)

 # Vocabulary: assign an integer id to every distinct token in `texts`.
 dictionary = corpora.Dictionary(texts)
 # no_above=0.3 discards tokens found in more than 30% of the documents;
 # the method's other thresholds are left at gensim's defaults.
 dictionary.filter_extremes(no_above=0.3)
 # The return value is unused.
 # NOTE(review): presumably kept for a side effect (forcing the id->token
 # table to be built) -- verify before removing.
 dictionary.values()

 # Bag-of-words vector (token id, count) for each document.
 corpus = list(map(dictionary.doc2bow, texts))

 # Fit a 4-topic LDA model. No id2word mapping is passed, so the code that
 # reads the topics has to translate token ids back to words itself.
 LdaModel = models.ldamodel.LdaModel
 lda = LdaModel(corpus=corpus, num_topics=4)

 # print(dictionary.id2token)
 # print(lda.show_topics())


 # Print the top keywords of every topic, one list per line.
 # range() excludes its stop value, so the bound has to be num_topics itself
 # for the last topic to be included.
 for i in range(lda.num_topics):
     keywords = []
     for item in lda.show_topic(i):
         # NOTE(review): this indexing assumes show_topic() yields
         # (score, token-id-as-string) pairs. Newer gensim releases appear
         # to return (word, probability) instead -- confirm against the
         # installed version.
         key = int(item[1])
         # Look the id up through the Dictionary itself: dictionary[id] is
         # the supported id->token lookup, whereas the id2token attribute is
         # only filled in lazily and may still be empty here.
         keywords.append(dictionary[key])
     print(keywords)

 """
 Output (sample run; only the first three of the four topics are listed):
 ['women', 'cup', 'clinton', 'game', 'york', 'black', 'left', 'president', 'change', 'white']
 ['cup', 'game', 'women', 'clinton', 'change', 'left', 'president', 'white', 'says', 'eastern']
 ['al', 'leaders', 'groups', 'clinton', 'leader', 'group', 'isis', 'change', 'how', 'him']
 """
	import feedparser
	import re
	import json
	import os.path
	from gensim import corpora, models, similarities
	from newspaper import Article

	# Tokenised corpus: one list of lowercase word tokens per article.
	texts = []

	if os.path.isfile('data.json'):
	    # A previous run already cached the tokenised articles; reuse them
	    # instead of downloading everything again.
	    print('load data.json')
	    with open('data.json') as cache:
	        texts = json.load(cache)
	else:
	    print('get articles from cnn.com')
	    rss = 'http://rss.cnn.com/rss/cnn_topstories.rss'
	    # rss = 'http://rss.cnn.com/rss/cnn_tech.rss'
	    d = feedparser.parse(rss)
	    links = [entry.link for entry in d.entries]

	    # Download every linked article; a document is "<title> <body text>".
	    documents = []
	    for url in links:
	        page = Article(url)
	        page.download()
	        page.parse()
	        documents.append(page.title + ' ' + page.text)

	    print(len(documents))

	    stoplist = set('this it is i we are for a an of the and to in at photos image video caption hide'.split())

	    # Lowercase, turn every non-letter into a space, split on whitespace,
	    # then drop the stop words.
	    non_letters = re.compile("[^a-zA-Z]")
	    texts = []
	    for document in documents:
	        words = non_letters.sub(" ", document.lower()).split()
	        texts.append([word for word in words if word not in stoplist])

	    # Cache the tokens so later runs take the branch above.
	    with open('data.json', 'w') as cache:
	        json.dump(texts, cache)

	# Vocabulary: assign an integer id to every distinct token in `texts`.
	dictionary = corpora.Dictionary(texts)
	# no_above=0.3 discards tokens found in more than 30% of the documents;
	# the method's other thresholds are left at gensim's defaults.
	dictionary.filter_extremes(no_above=0.3)
	# The return value is unused.
	# NOTE(review): presumably kept for a side effect (forcing the id->token
	# table to be built) -- verify before removing.
	dictionary.values()

	# Bag-of-words vector (token id, count) for each document.
	corpus = list(map(dictionary.doc2bow, texts))

	# Fit a 4-topic LDA model. No id2word mapping is passed, so the code that
	# reads the topics has to translate token ids back to words itself.
	LdaModel = models.ldamodel.LdaModel
	lda = LdaModel(corpus=corpus, num_topics=4)

	# print(dictionary.id2token)
	# print(lda.show_topics())


	# Print the top keywords of every topic, one list per line.
	# range() excludes its stop value, so the bound has to be num_topics itself
	# for the last topic to be included.
	for i in range(lda.num_topics):
	    keywords = []
	    for item in lda.show_topic(i):
	        # NOTE(review): this indexing assumes show_topic() yields
	        # (score, token-id-as-string) pairs. Newer gensim releases appear
	        # to return (word, probability) instead -- confirm against the
	        # installed version.
	        key = int(item[1])
	        # Look the id up through the Dictionary itself: dictionary[id] is
	        # the supported id->token lookup, whereas the id2token attribute is
	        # only filled in lazily and may still be empty here.
	        keywords.append(dictionary[key])
	    print(keywords)

	"""
	Output (sample run; only the first three of the four topics are listed):
	['women', 'cup', 'clinton', 'game', 'york', 'black', 'left', 'president', 'change', 'white']
	['cup', 'game', 'women', 'clinton', 'change', 'left', 'president', 'white', 'says', 'eastern']
	['al', 'leaders', 'groups', 'clinton', 'leader', 'group', 'isis', 'change', 'how', 'him']
	"""