Created
March 17, 2018 02:40
-
-
Save kinoko3/35d56aa34743d1cb123bb2b87310f183 to your computer and use it in GitHub Desktop.
solidot-wordcloud
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import threading
from concurrent import futures

import pymongo
import requests
from lxml import etree
# Upper bound on the number of concurrent download threads.
MAX_WORKERS = 8
# get() is run from several worker threads at once (see get_list), and every
# call appends to the same output file, so the append is serialised.
_OUTPUT_LOCK = threading.Lock()


def get(url):
    """Download one page and append its extracted text to the output file.

    The text nodes selected by the XPath below are stripped, concatenated
    without separators and appended to ``cloud_main_data.txt``. Pages for
    which the XPath matches nothing are skipped silently.

    Args:
        url: Address of the page to fetch.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # The context manager releases the session's connections; the timeout
    # keeps a stalled server from blocking a worker thread forever.
    with requests.session() as session:
        page = session.get(url=url, timeout=30).text

    html = etree.HTML(page)
    if html is None:
        # Nothing parseable came back (e.g. an empty body).
        return

    fragments = html.xpath('//*[@id="center"]/div/div[3]/div/text()')
    if not fragments:
        return

    text = ''.join(fragment.strip() for fragment in fragments)
    # Open the file once per page rather than once per fragment, and hold the
    # lock so text from different pages is not interleaved.
    with _OUTPUT_LOCK:
        with open('cloud_main_data.txt', 'at', encoding='UTF-8') as f:
            f.write(text)
    print('OK')
def get_list(url_list):
    """Fetch every URL in ``url_list`` concurrently using ``get``.

    At most ``MAX_WORKERS`` threads are started, and fewer when there are
    fewer URLs than that.

    Args:
        url_list: Sized collection of page URLs to download.
    """
    if not url_list:
        # ThreadPoolExecutor rejects a worker count of zero.
        return

    # min(), not max(): MAX_WORKERS is a cap on the pool size, not a floor.
    workers = min(MAX_WORKERS, len(url_list))
    with futures.ThreadPoolExecutor(workers) as executor:
        # The result iterator is not consumed, so an exception raised inside
        # get() for one URL does not abort the remaining downloads.
        executor.map(get, url_list)
if __name__ == '__main__':
    # Inclusive range of daily issues to scrape.
    begin = datetime.date(2018, 1, 1)
    end = datetime.date(2018, 3, 12)

    # One YYYYMMDD issue identifier per day, computed once and shared by both
    # URL lists.
    issues = [
        (begin + datetime.timedelta(days=offset)).strftime('%Y%m%d')
        for offset in range((end - begin).days + 1)
    ]

    # URLs for the main site; built but not scraped by this run.
    url_list = ['https://www.solidot.org/?issue=' + issue for issue in issues]
    cloud_url_list = ['https://cloud.solidot.org/?issue=' + issue for issue in issues]

    get_list(url_list=cloud_url_list)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import jieba | |
import jieba.analyse | |
from wordcloud import WordCloud | |
import matplotlib.pyplot as plt | |
from pprint import pprint | |
# Read the scraped corpus; the context manager closes the file handle.
with open('main_data.txt', 'r', encoding='UTF-8') as corpus_file:
    text_from_file = corpus_file.read()

# Custom vocabulary and stop words for keyword extraction.
jieba.load_userdict("userdict.txt")
jieba.analyse.set_stop_words('stop_words.txt')

# With withWeight=True each tag is a (word, weight) pair.
tags = jieba.analyse.extract_tags(text_from_file, topK=100, withWeight=True)
data_dict = dict(tags)

wc = WordCloud(
    font_path='simkai.ttf',
    background_color='white',
    max_words=100,
    max_font_size=180,
    random_state=42,
    width=1500, height=1500,
)
wc.generate_from_frequencies(data_dict)

# Display the cloud, then save it and dump the word weights.
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file('100.png')
pprint(data_dict)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment