Created
August 16, 2016 08:43
-
-
Save graph226/9336deab4e0197ad99ecf42fd01d2fe4 to your computer and use it in GitHub Desktop.
Visualising LINE messages (word cloud of nouns from a chat export)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding: utf-8 | |
import csv | |
import MeCab | |
import matplotlib.pyplot as plt | |
from wordcloud import WordCloud | |
import re | |
# Surface forms that get_nouns() drops even when MeCab tags them as nouns.
# NOTE(review): "の" is listed twice; harmless for membership tests.
STOP_WORDS = "の もの こと よう これ 一 http:// の 笑 ω 物 とき ら 画像 Sticker Photo".split()

# Values of the second MeCab feature field (the noun sub-category) that
# get_nouns() rejects.
STOP_NOUN_DETAILS = "数 接続助詞 接尾 代名詞 非自立 副詞可能".split()
# Lazily created MeCab tagger shared by every get_nouns() call.
_TAGGER = None


def _get_tagger():
    """Return the shared MeCab tagger, constructing it on first use."""
    global _TAGGER
    if _TAGGER is None:
        _TAGGER = MeCab.Tagger()
    return _TAGGER


def get_nouns(string):
    """Return the nouns MeCab finds in *string*, minus the stop lists.

    The input is converted with ``str()`` and parsed with MeCab. A token is
    kept when the first field of its feature string is "名詞" (noun), the
    second field is not in ``STOP_NOUN_DETAILS``, and its surface form is
    not in ``STOP_WORDS``.

    Args:
        string: Text to analyse; anything accepted by ``str()``.

    Returns:
        A list of surface forms in the order they occur in the text.
    """
    tagger = _get_tagger()
    text = str(string)
    node = tagger.parseToNode(text)
    nouns = []
    while node:
        # Split the comma-separated feature string once and reuse the parts.
        fields = node.feature.split(",")
        pos = fields[0]
        # A feature with a single field has no sub-category; treat it as
        # empty instead of raising IndexError.
        noun_detail = fields[1] if len(fields) > 1 else ""
        word = node.surface
        if (pos == "名詞"
                and noun_detail not in STOP_NOUN_DETAILS
                and word not in STOP_WORDS):
            nouns.append(word)
        node = node.next
    return nouns
def _extract_posts(lines):
    """Pull the message text out of raw chat-log lines.

    A line beginning with ``HH:MM<TAB>...<TAB>`` has that prefix removed and
    the right-stripped remainder kept. Any other line that contains a double
    quote is kept whole (right-stripped). All remaining lines are dropped.
    """
    # NOTE(review): ``.*`` is greedy, so the prefix extends to the LAST tab
    # on the line; a message that itself contains a tab loses its leading
    # part. Kept as in the original -- confirm against the export format.
    message_prefix = re.compile(r"\d\d:\d\d\t.*\t")
    posts = []
    for line in lines:
        usual = message_prefix.match(line)
        if usual:
            posts.append(line[usual.end():].rstrip())
        elif '"' in line:
            posts.append(line.rstrip())
    return posts


def main(talk_path='talk.txt', font_path="/Library/Fonts/Yu Gothic Medium.otf"):
    """Render a word cloud of the nouns found in a chat log.

    Reads *talk_path*, extracts the message text of each line, collects the
    nouns via ``get_nouns`` and shows a 1280x720 white-background word cloud
    in a 16x9 matplotlib figure with the axes hidden.

    Args:
        talk_path: Path of the chat log to read. Defaults to ``talk.txt``.
        font_path: Font file handed to ``WordCloud``. Defaults to the
            Yu Gothic Medium font under ``/Library/Fonts``.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(talk_path, 'r') as talk_file:
        posts = _extract_posts(talk_file)

    nouns_all = []
    for post in posts:
        nouns_all.extend(get_nouns(post))

    if not nouns_all:
        # Nothing to draw; stop before building the cloud.
        print("No nouns found in " + talk_path)
        return

    lst_wordcloud = " ".join(nouns_all)
    # Byte strings (Python 2 ``str``) still need decoding; text strings
    # (Python 3 ``str``) have no ``decode`` and are used as they are.
    if isinstance(lst_wordcloud, bytes):
        lst_wordcloud = lst_wordcloud.decode('utf-8')

    wordcloud = WordCloud(
        background_color="white",
        font_path=font_path,
        width=1280,
        height=720,
    ).generate(lst_wordcloud)

    plt.figure(figsize=(16, 9))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment