Skip to content

Instantly share code, notes, and snippets.

@jahentao
Created July 23, 2018 10:37
Show Gist options
  • Save jahentao/d3d6ab99ce12b230f66fa98962d94b50 to your computer and use it in GitHub Desktop.
Save jahentao/d3d6ab99ce12b230f66fa98962d94b50 to your computer and use it in GitHub Desktop.
用gensim的LDA模型,分析得到MongoDB中body的主题词,并作为新字段插入
# -*- coding: utf-8 -*-
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
# Parse the HTML held in the body field
from bs4 import BeautifulSoup
# Remove English stop words; the NLTK corpus has to be downloaded first
# NOTE: this repeats the stopwords import a few lines above.
from nltk.corpus import stopwords
# Strip HTML tags
from HTMLParser import HTMLParser
# Remove URLs and digits
import re
# Remove punctuation
import string
# Talk to MongoDB
import pymongo
# Suppress gensim warnings
import warnings
# FIXING BUG: sys is needed for the reload(sys)/setdefaultencoding call
# made later in this file.
import sys
# Silence every UserWarning raised from the gensim module.
warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')
# Minimal HTML parser that discards markup and keeps only the text nodes.
class MLStripper(HTMLParser):
    """Collect the character data of an HTML fragment, dropping all tags.

    Push markup in with ``feed()`` and read the concatenated text back
    with ``get_data()``.
    """

    def __init__(self):
        # Run the base initialiser instead of calling reset() directly, so
        # that every attribute the parser relies on exists (Python 3's
        # html.parser sets ``convert_charrefs`` there). The explicit-class
        # call is used rather than super() so it also works when the base
        # is an old-style class, as in Python 2's HTMLParser module.
        HTMLParser.__init__(self)
        # Text chunks seen so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded text joined into a single string."""
        return ''.join(self.fed)
# English stop words from NLTK (the stopwords corpus must be available),
# kept in a set for the per-token membership test in filter_lines.
stop = set(stopwords.words('english'))
def strip_tags(html):
    """Return the text content of *html* with the markup removed.

    The input is pushed through a fresh ``MLStripper`` and whatever text
    it collected is handed back as one string.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only
# Clean every line and tokenise it; the resulting texts are the input from
# which the dictionary is built.
def filter_lines(lines):
    """Turn raw HTML lines into lists of lower-cased, filtered tokens.

    Each line has its markup stripped, then URLs, punctuation and numbers
    removed. What is left is lower-cased, split on whitespace and filtered
    against the module-level ``stop`` set.

    :param lines: iterable of strings, one per line of text or markup.
    :return: list of token lists, one for every line that still has at
        least one token after filtering (empty lines are dropped).
    """
    # All three patterns are loop-invariant, so compile them once up front.
    url_pattern = re.compile(
        r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    # A digit run that follows a space OR starts the line; the original
    # pattern required the leading space and so kept line-initial numbers.
    number_pattern = re.compile(r"(?:^| )\d+")

    texts = []
    for line in lines:
        # Drop the tags, keep only the content.
        stripped_line = strip_tags(line)
        # Remove URLs.
        stripped_line = url_pattern.sub('', stripped_line)
        # Replace punctuation with spaces so adjacent words stay separated.
        stripped_line = punctuation_pattern.sub(' ', stripped_line)
        # Remove numbers.
        stripped_line = number_pattern.sub(" ", stripped_line)
        # Remove stop words.
        text = [word for word in stripped_line.lower().split() if word not in stop]
        if text:
            texts.append(text)
    return texts
# Python 2 only: switch the interpreter's default string encoding to UTF-8.
# NOTE(review): presumably this is the "bug fix" that keeps the str/unicode
# mixing below from raising; confirm before porting to Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

# Default MongoClient connection; database "question", collection "question".
db = pymongo.MongoClient().question
coll = db.question

# Number of documents that have been given a topic_words field so far.
index = 0
for document in coll.find():
    _id = document['_id']
    title = document['title']
    body = document['body']

    soup = BeautifulSoup(body, "html.parser")
    # Remove <code></code> elements so their contents do not become tokens.
    for code in soup("code"):
        code.decompose()
    body = str(soup)

    lines = body.split('\n')
    # The title carries important information too.
    lines.append(title)
    texts = filter_lines(lines)
    if not texts:
        # Nothing survived filtering. LdaModel refuses to train on an empty
        # vocabulary, so skip this document instead of aborting the run.
        continue

    # Build the dictionary.
    dictionary = corpora.Dictionary(texts)
    # Encode every text as a bag-of-words vector using the dictionary.
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Train a single-topic model on this document's own corpus.
    goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=1)

    topics = []
    for topic_id, topic in goodLdaModel.show_topics(num_topics=1, formatted=False):
        # Keep the words, drop their weights.
        topics.append([word for word, _ in topic])

    # Store the topic words as a new field. update_one replaces the
    # deprecated Collection.update, which PyMongo 4 removed.
    coll.update_one({"_id": _id}, {"$set": {"topic_words": topics[0]}})
    index = index + 1
    print(index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment