Created
August 4, 2017 16:14
-
-
Save hokuma/fa66b6e1fb19be5da995a681fbe6689f to your computer and use it in GitHub Desktop.
lda
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import string | |
import json | |
from pprint import pprint | |
from gensim import corpora, models | |
from collections import defaultdict | |
def stopword(token):
    """Return True if *token* should be discarded as punctuation.

    A token is a stopword when every character in it is an ASCII
    punctuation character (see ``string.punctuation``).  The empty token
    is also treated as a stopword.

    The previous implementation used ``string.punctuation.find(token)``,
    a substring test against one fixed-order string: it matched ``"()"``
    but missed ``"..."``, ``"--"`` or ``")("``.  Checking each character
    individually accepts everything the old test accepted and also
    catches those multi-character punctuation tokens.
    """
    # all() over an empty token is True, which keeps the old behaviour of
    # dropping empty tokens (e.g. produced by consecutive commas).
    return all(char in string.punctuation for char in token)
# Number of topics to fit and number of top words reported per topic.
# Kept as single constants so the model size and the reporting loop below
# cannot drift apart.
NUM_TOPICS = 10
TOP_WORDS = 10

# Each line on stdin is one document: a comma-separated list of tokens.
docs = [line.rstrip('\n').split(',') for line in sys.stdin]

# Drop every token that stopword() flags.
docs = [[token for token in doc if not stopword(token)]
        for doc in docs]

# Count token occurrences across all documents, then keep only tokens
# that appear more than once in the whole input.
frequency = defaultdict(int)
for doc in docs:
    for token in doc:
        frequency[token] += 1
docs = [[token for token in doc if frequency[token] > 1]
        for doc in docs]

# Build the token <-> id mapping and the bag-of-words corpus, then fit LDA.
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = models.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# Collect, for each topic, a list of single-entry {word: weight} dicts.
topics = []
for topic_id in range(NUM_TOPICS):
    words = lda.show_topic(topic_id, topn=TOP_WORDS)
    # NOTE(review): depending on the gensim version the weights may be NumPy
    # scalars (e.g. float32), which json.dumps rejects; float() makes them
    # plain Python floats and is a no-op otherwise -- confirm against the
    # installed gensim.
    topics.append([{word: float(weight)} for word, weight in words])

print(json.dumps(topics))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment