Use gensim's LDA model to extract the topic words of the body field of each MongoDB document, and insert them back as a new field.
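In effect, every question document gains a topic_words field. A minimal sketch of the before/after shape (field names are the ones used by the script below; the values are made up for illustration):

# before
{"_id": 1, "title": "Parsing JSON in Java", "body": "<p>...</p>"}
# after
{"_id": 1, "title": "Parsing JSON in Java", "body": "<p>...</p>", "topic_words": ["json", "java", "parsing"]}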
# -*- coding: utf-8 -*-
from gensim import corpora
from gensim.models import LdaModel
# parse the HTML of the body field
from bs4 import BeautifulSoup
# strip English stop words (the NLTK stopwords corpus must be downloaded first)
from nltk.corpus import stopwords
# strip HTML tags (Python 2 standard-library module)
from HTMLParser import HTMLParser
# strip URLs and digits
import re
# strip punctuation
import string
# access MongoDB
import pymongo
# suppress gensim warnings
import warnings
# needed for the Python 2 default-encoding workaround below
import sys

warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
# class to strip html
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

stop = set(stopwords.words('english'))

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
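# example: strip_tags('<p>Hello <b>world</b></p>') returns 'Hello world'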
# filter each line into a list of token lists (texts), the input for building the dictionary
def filter_lines(lines):
    texts = []
    for line in lines:
        # strip tags, keep only the text content
        stripped_line = strip_tags(line)
        # strip URLs
        stripped_line = re.sub(
            r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))',
            '', stripped_line)
        # replace punctuation with spaces
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        stripped_line = regex.sub(' ', stripped_line)
        # strip digits
        stripped_line = re.sub(r" \d+", " ", stripped_line)
        # lowercase, tokenize and drop stop words
        text = [word for word in stripped_line.lower().split() if word not in stop]
        if text:
            texts.append(text)
    return texts
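# example: filter_lines(['<p>Parsing JSON in Java</p>']) returns [['parsing', 'json', 'java']]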
reload(sys)
sys.setdefaultencoding('utf8')

db = pymongo.MongoClient().question
coll = db.question
index = 0
for cursor in coll.find():
    _id = cursor['_id']
    title = cursor['title']
    body = cursor['body']
    soup = BeautifulSoup(body, "html.parser")
    # drop <code></code> blocks
    for code in soup("code"):
        code.decompose()
    body = str(soup)
    lines = body.split('\n')
    # the title also carries important information
    lines.append(title)
    texts = filter_lines(lines)
    # print(texts)
    # build the dictionary
    dictionary = corpora.Dictionary(texts)
    # encode each line as a bag-of-words vector against the dictionary
    corpus = [dictionary.doc2bow(text) for text in texts]
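    # each bag-of-words vector is a list of (token_id, count) pairs, e.g. [(0, 1), (3, 2)]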
    # train a single-topic LDA model on this question's corpus
    goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=1)
    topics = []
    for topic_id, topic in goodLdaModel.show_topics(num_topics=1, formatted=False):
        topic = [word for word, _ in topic]
        topics.append(topic)
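    # with formatted=False, show_topics returns (topic_id, [(word, probability), ...]) pairs,
    # so each entry of topics is the list of top words for that topic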
    # print the topic words
    # print(topics[0])
    # store the topic words in a new topic_words field on this document
    coll.update({"_id": _id}, {"$set": {"topic_words": topics[0]}})
    index = index + 1
    print(index)
    # if index > 0:
    #     break
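Once the script has run, one way to spot-check the result is to read a document back. A minimal sketch, assuming the same local MongoDB instance and question.question collection used above:

import pymongo

# connect to the same database and collection the script writes to
coll = pymongo.MongoClient().question.question
# fetch any document that already has the new field and show its topic words
doc = coll.find_one({"topic_words": {"$exists": True}})
if doc is not None:
    print(doc["title"])
    print(doc["topic_words"])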