Skip to content

Instantly share code, notes, and snippets.

@graph226
Created August 16, 2016 08:43
Show Gist options
  • Save graph226/9336deab4e0197ad99ecf42fd01d2fe4 to your computer and use it in GitHub Desktop.
Save graph226/9336deab4e0197ad99ecf42fd01d2fe4 to your computer and use it in GitHub Desktop.
Visualising LINE messages
#coding: utf-8
import csv
import MeCab
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
# Surface forms that are never added to the noun list, even when MeCab
# tags them as nouns.  The order and the repeated "の" are kept as-is.
STOP_WORDS = [
    "の", "もの", "こと", "よう", "これ", "一", "http://", "の",
    "笑", "ω", "物", "とき", "ら", "画像", "Sticker", "Photo",
]

# Noun sub-categories (second field of the MeCab feature string) that are
# discarded: number, conjunctive particle, suffix, pronoun,
# non-independent, adverbial-possible.
STOP_NOUN_DETAILS = [
    "数", "接続助詞", "接尾", "代名詞", "非自立", "副詞可能",
]
def get_nouns(string):
    """Return the nouns MeCab finds in *string*, minus the stop lists.

    The text is tokenised with ``MeCab.Tagger.parseToNode``.  A token is
    kept when the first field of its comma-separated feature string is
    "名詞" (noun), its second field is not in ``STOP_NOUN_DETAILS`` and its
    surface form is not in ``STOP_WORDS``.

    Args:
        string: Text to analyse; it is passed through ``str()`` first.

    Returns:
        A list of noun surface forms in order of appearance, duplicates
        included.
    """
    # The tagger is created on first use and then reused, so that calling
    # this function once per message does not rebuild it every time.
    tagger = getattr(get_nouns, "_tagger", None)
    if tagger is None:
        tagger = get_nouns._tagger = MeCab.Tagger()

    text = str(string)
    node = tagger.parseToNode(text)
    nouns = []
    while node:
        features = node.feature.split(",")
        pos = features[0]
        # A feature string with a single field has no sub-category; treat
        # it as empty instead of failing with IndexError.
        noun_detail = features[1] if len(features) > 1 else ""
        word = node.surface
        if (pos == "名詞"
                and noun_detail not in STOP_NOUN_DETAILS
                and word not in STOP_WORDS):
            nouns.append(word)
        node = node.next
    return nouns
def main():
    """Show a word cloud of the nouns found in the chat log ``talk.txt``.

    Reads ``talk.txt`` from the current working directory, keeps the
    message part of each relevant line, extracts nouns with ``get_nouns``
    and displays the resulting word cloud in a matplotlib window.
    """
    fpath = "/Library/Fonts/Yu Gothic Medium.otf"

    # "HH:MM<TAB>...<TAB>text" lines: everything up to the last tab is
    # dropped (``.*`` is greedy) and the remainder is kept.
    # NOTE(review): the middle field is presumably the sender name - confirm
    # against the export format.
    header_re = re.compile(r"\d\d:\d\d\t.*\t")
    # Any other line containing a double quote is kept whole.
    quoted_re = re.compile(r'.*"')

    posts = []
    # The context manager closes the file even if parsing fails.
    with open('talk.txt', 'r') as talk:
        for line in talk:
            usual = header_re.match(line)
            if usual:
                posts.append(line[usual.end():].rstrip())
            elif quoted_re.match(line):
                posts.append(line.rstrip())

    nouns_all = []
    for post in posts:
        nouns_all.extend(get_nouns(post))

    lst_wordcloud = " ".join(nouns_all)
    # On Python 2 the joined text is a byte string and must be decoded; on
    # Python 3 it is already text and has no ``decode`` method.
    if isinstance(lst_wordcloud, bytes):
        lst_wordcloud = lst_wordcloud.decode('utf-8')

    wordcloud = WordCloud(
        background_color="white",
        font_path=fpath,
        width=1280,
        height=720,
    ).generate(lst_wordcloud)

    plt.figure(figsize=(16, 9))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
# Run the visualisation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment