jahentao · July 23, 2018 10:37
diff --git a/process.py b/process.py
 # -*- coding: utf-8 -*-

 from gensim import corpora
 from gensim.models import LdaModel
 from nltk.corpus import stopwords
 # 解析body域的HTML
 from bs4 import BeautifulSoup
 # 去除英文停用词，要先下载
 from nltk.corpus import stopwords
 # 去掉HTML标签
 from HTMLParser import HTMLParser
 # 去掉URL、数字
 import re
 # 去掉标点符号
 import string
 # 操纵MongoDB
 import pymongo
 # 抑制gensim警告
 import warnings
 # sys is needed only for the Python 2 reload(sys)/setdefaultencoding('utf8') workaround
 import sys

 warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')

 # class to strip html
 class MLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text nodes.

    Feed it HTML with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base-class initialiser instead of calling reset() directly:
        # it performs the reset itself and, on Python 3, also sets the
        # ``convert_charrefs`` attribute that feed() depends on.
        HTMLParser.__init__(self)
        # Text fragments, in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Parser callback: remember one fragment of character data."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

 # English stop words from NLTK, held in a set so membership tests are cheap
 # (the NLTK "stopwords" corpus has to be downloaded beforehand).
 stop = set(stopwords.words('english'))

 def strip_tags(html):
    """Return the character data of *html* with all tags removed.

    :param html: markup (or plain text) to clean.
    :return: the text fragments found by ``MLStripper``, concatenated.
    """
    stripper = MLStripper()
    stripper.feed(html)
    plain_text = stripper.get_data()
    return plain_text

 def filter_lines(lines):
    """Turn raw text lines into token lists for ``corpora.Dictionary``.

    Each line has its HTML tags, URLs, punctuation and numbers removed, is
    lower-cased and split on whitespace, and loses its English stop words.

    :param lines: iterable of strings (may contain HTML markup).
    :return: list of token lists, one per line that still has tokens left;
        lines that end up empty are dropped.
    """
    # Compile the patterns once per call rather than once per line.
    url_re = re.compile(
        r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    # A run of digits that starts a token.  The previous pattern required a
    # leading space, so a number at the very start of a line was kept.
    number_re = re.compile(r'(?:^| )\d+')

    texts = []
    for line in lines:
        # Keep only the text content of any markup.
        stripped_line = strip_tags(line)

        # Remove URLs.
        stripped_line = url_re.sub('', stripped_line)

        # Replace punctuation with a space so adjoining words stay separate.
        stripped_line = punctuation_re.sub(' ', stripped_line)

        # Remove numbers.
        stripped_line = number_re.sub(' ', stripped_line)

        # Drop stop words.
        text = [word for word in stripped_line.lower().split() if word not in stop]
        if text:
            texts.append(text)

    return texts

 # Python 2 only: make UTF-8 the process-wide default codec.
 # NOTE(review): presumably required so that str(soup) below does not fail on
 # non-ASCII question bodies -- confirm before removing.
 reload(sys)
 sys.setdefaultencoding('utf8')


 db = pymongo.MongoClient().question
 coll = db.question

 # Number of questions handled so far (printed as a progress indicator).
 index = 0

 for document in coll.find():
    _id = document['_id']
    title = document['title']
    body = document['body']

    soup = BeautifulSoup(body, "html.parser")
    # Drop every <code> element before extracting text.
    for code in soup("code"):
        code.decompose()
    body = str(soup)

    lines = body.split('\n')
    # The title carries important information as well.
    lines.append(title)

    texts = filter_lines(lines)

    # LdaModel raises ValueError when the vocabulary is empty, which used to
    # abort the whole run.  A question with no usable tokens is now simply
    # left without a "topic_words" field.
    if texts:
        # Build the vocabulary for this question.
        dictionary = corpora.Dictionary(texts)

        # Encode every line as a bag-of-words vector over that vocabulary.
        corpus = [dictionary.doc2bow(text) for text in texts]

        # Fit a single-topic model on this question only.
        goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=1)

        # Keep the words of each topic and discard their weights.
        topics = [
            [word for word, _ in topic]
            for topic_id, topic in goodLdaModel.show_topics(num_topics=1, formatted=False)
        ]

        # Store the topic words on the question document.  update_one()
        # replaces the deprecated Collection.update() (needs PyMongo >= 3.0).
        coll.update_one({"_id": _id}, {"$set": {"topic_words": topics[0]}})

    index = index + 1
    print(index)
	# -*- coding: utf-8 -*-

	from gensim import corpora
	from gensim.models import LdaModel
	from nltk.corpus import stopwords
	# 解析body域的HTML
	from bs4 import BeautifulSoup
	# 去除英文停用词，要先下载
	from nltk.corpus import stopwords
	# 去掉HTML标签
	from HTMLParser import HTMLParser
	# 去掉URL、数字
	import re
	# 去掉标点符号
	import string
	# 操纵MongoDB
	import pymongo
	# 抑制gensim警告
	import warnings
	# sys is needed only for the Python 2 reload(sys)/setdefaultencoding('utf8') workaround
	import sys

	warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')

	# class to strip html
	class MLStripper(HTMLParser):
	    """HTML parser that discards markup and keeps only the text nodes.

	    Feed it HTML with ``feed()`` and read the accumulated text back with
	    ``get_data()``.
	    """

	    def __init__(self):
	        # Run the base-class initialiser instead of calling reset()
	        # directly: it performs the reset itself and, on Python 3, also
	        # sets the ``convert_charrefs`` attribute that feed() depends on.
	        HTMLParser.__init__(self)
	        # Text fragments, in the order the parser reported them.
	        self.fed = []

	    def handle_data(self, d):
	        """Parser callback: remember one fragment of character data."""
	        self.fed.append(d)

	    def get_data(self):
	        """Return all text collected so far as a single string."""
	        return ''.join(self.fed)

	# English stop words from NLTK, held in a set so membership tests are cheap
	# (the NLTK "stopwords" corpus has to be downloaded beforehand).
	stop = set(stopwords.words('english'))

	def strip_tags(html):
	    """Return the character data of *html* with all tags removed.

	    :param html: markup (or plain text) to clean.
	    :return: the text fragments found by ``MLStripper``, concatenated.
	    """
	    # The function body had lost its indentation; it is restored here.
	    s = MLStripper()
	    s.feed(html)
	    return s.get_data()

	def filter_lines(lines):
	    """Turn raw text lines into token lists for ``corpora.Dictionary``.

	    Each line has its HTML tags, URLs, punctuation and numbers removed, is
	    lower-cased and split on whitespace, and loses its English stop words.

	    :param lines: iterable of strings (may contain HTML markup).
	    :return: list of token lists, one per line that still has tokens left;
	        lines that end up empty are dropped.
	    """
	    # Compile the patterns once per call rather than once per line.
	    # The URL pattern is restored to its intact form: the pasted version
	    # had "|" turned into a literal "\|" and had lost its "*" quantifiers.
	    url_re = re.compile(
	        r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')
	    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
	    # A run of digits that starts a token.  The previous pattern required
	    # a leading space, so a number at the very start of a line was kept.
	    number_re = re.compile(r'(?:^| )\d+')

	    texts = []
	    for line in lines:
	        # Keep only the text content of any markup.
	        stripped_line = strip_tags(line)

	        # Remove URLs.
	        stripped_line = url_re.sub('', stripped_line)

	        # Replace punctuation with a space so adjoining words stay separate.
	        stripped_line = punctuation_re.sub(' ', stripped_line)

	        # Remove numbers.
	        stripped_line = number_re.sub(' ', stripped_line)

	        # Drop stop words.
	        text = [word for word in stripped_line.lower().split() if word not in stop]
	        if text:
	            texts.append(text)

	    return texts

	# Python 2 only: make UTF-8 the process-wide default codec.
	# NOTE(review): presumably required so that str(soup) below does not fail
	# on non-ASCII question bodies -- confirm before removing.
	reload(sys)
	sys.setdefaultencoding('utf8')


	db = pymongo.MongoClient().question
	coll = db.question

	# Number of questions handled so far (printed as a progress indicator).
	index = 0

	for document in coll.find():
	    _id = document['_id']
	    title = document['title']
	    body = document['body']

	    soup = BeautifulSoup(body, "html.parser")
	    # Drop every <code> element before extracting text.
	    for code in soup("code"):
	        code.decompose()
	    body = str(soup)

	    lines = body.split('\n')
	    # The title carries important information as well.
	    lines.append(title)

	    texts = filter_lines(lines)

	    # LdaModel raises ValueError when the vocabulary is empty, which would
	    # abort the whole run.  A question with no usable tokens is simply
	    # left without a "topic_words" field.
	    if texts:
	        # Build the vocabulary for this question.
	        dictionary = corpora.Dictionary(texts)

	        # Encode every line as a bag-of-words vector over that vocabulary.
	        corpus = [dictionary.doc2bow(text) for text in texts]

	        # Fit a single-topic model on this question only.
	        goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=1)

	        # Keep the words of each topic and discard their weights.
	        topics = [
	            [word for word, _ in topic]
	            for topic_id, topic in goodLdaModel.show_topics(num_topics=1, formatted=False)
	        ]

	        # Store the topic words on the question document.  update_one()
	        # replaces the deprecated Collection.update() (needs PyMongo >= 3.0).
	        coll.update_one({"_id": _id}, {"$set": {"topic_words": topics[0]}})

	    index = index + 1
	    print(index)