Code for trying the same thing with JUMAN++, since http://ensekitt.hatenablog.com/entry/tree2 did it with MeCab.
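For orientation, a raw Juman++ call through pyknp looks roughly like the minimal sketch below (an illustration, assuming pyknp's standard Jumanpp / analysis / mrph_list interface and that the jumanpp binary is on the PATH); the morpheme attributes it exposes (midasi, genkei, hinsi, ...) are the ones the filters in this script work with.

from pyknp import Jumanpp

# Minimal sketch: tokenize one sentence and print surface form, base form, and POS
jumanpp = Jumanpp()
result = jumanpp.analysis('ツイートを形態素解析する')
for mrph in result.mrph_list():
    print(mrph.midasi, mrph.genkei, mrph.hinsi)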
# coding: utf-8

# In[ ]:

# JUMAN++ version
# Collect tweets
# https://github.com/sixohsix/twitter
# API tokens etc. are kept in environment variables
import os
from twitter import *
import time

# In[ ]:

# Morphological analysis with PyKNP
from janome.charfilter import *
from janome.tokenfilter import *
from pyknp import Jumanpp
# Symbols, particles, and auxiliary verbs are dropped
# Verbs are converted to their base form

# In[ ]:

# Scikit-learn decision tree and visualization
from sklearn import tree
from io import StringIO
import pydotplus
from IPython.display import Image
import numpy as np
# In[37]:

# Defined separately so that the filters that worked with Janome can also be used with
# Juman++ (see the usage sketch in the cell after these class definitions).
# They inherit from Janome's TokenFilter, so the Analyzer is almost the same as Janome's.
class CompoundNounFilterForJumanpp(TokenFilter):
    def apply(self, tokens):
        _ret = None
        for token in tokens:
            if _ret:
                if token.hinsi == '名詞' and _ret.hinsi == '名詞':
                    # Merge consecutive nouns into a single compound noun
                    _ret.midasi += token.midasi
                    _ret.bunrui = '複合名詞'
                    _ret.genkei += token.genkei
                    _ret.yomi += token.yomi
                else:
                    ret = _ret
                    _ret = token
                    yield ret
            else:
                _ret = token
        if _ret:
            yield _ret
class POSStopFilterForJumanpp(TokenFilter):
    """
    Note that Juman++ and IPA use different part-of-speech taxonomies:
    http://www.unixuser.org/~euske/doc/postag/
    """
    def __init__(self, pos_list):
        self.pos_list = pos_list

    def apply(self, tokens):
        for token in tokens:
            # Drop tokens whose POS is in the stop list
            if token.hinsi in self.pos_list:
                continue
            yield token
class AnalyzerForJumanpp(object):
    def __init__(self, char_filters=[], tokenizer=None, token_filters=[]):
        self.tokenizer = tokenizer
        self.char_filters = char_filters
        self.token_filters = token_filters

    def analyze(self, text):
        for cfilter in self.char_filters:
            text = cfilter.filter(text)
        if text == '':
            text = ' '  # avoid passing an empty string to Juman++
        tokens = self.tokenizer.analysis(text)
        for tfilter in self.token_filters:
            tokens = tfilter.filter(tokens)
        return tokens
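# In[ ]:

# A minimal usage sketch of the classes above (illustration only, assuming the
# jumanpp binary is installed and on the PATH); the real run further below builds
# the analyzer per user with more filters.
_sample_analyzer = AnalyzerForJumanpp(
    char_filters=[UnicodeNormalizeCharFilter()],
    tokenizer=Jumanpp(),
    token_filters=[POSStopFilterForJumanpp(['助詞', '助動詞'])])
for _token in _sample_analyzer.analyze('JUMAN++で形態素解析してみる'):
    print(_token.midasi, _token.genkei, _token.hinsi)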
# In[5]:

# Twitter settings
TOKEN = os.environ["DTA_TWITTER_TOKEN"]
TOKEN_SECRET = os.environ["DTA_TWITTER_TOKEN_SECRET"]
CONSUMER_KEY = os.environ["DTA_TWITTER_CONSUMER_KEY"]
CONSUMER_SECRET = os.environ["DTA_TWITTER_CONSUMER_SECRET"]
t = Twitter(
    auth=OAuth(TOKEN, TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET))

# In[6]:

# Parameters
NUM_TWEET = 200
NUM_USER = 50
# In[29]:

def get_word_count(analyzer, tweets, word_list):
    """
    Count word occurrences over a list of tweet texts.
    New words are appended to word_list; counts are returned in the same order.
    """
    word_count = [0] * len(word_list)
    for text in tweets:
        try:
            for token in analyzer.analyze(text):
                #print(u"surface:%s\n reading:%s, base form:%s, POS:%s, POS subtype:%s, conj. type:%s, conj. form:%s, semantic info:%s, representative form:%s" \
                #      % (token.midasi, token.yomi, token.genkei, token.hinsi, token.bunrui, token.katuyou1, token.katuyou2, token.imis, token.repname))
                if '動詞' in token.hinsi:
                    word = token.genkei
                elif '形容詞' in token.hinsi:
                    word = token.genkei
                else:
                    word = token.midasi
                if word in word_list:
                    word_index = word_list.index(word)
                    word_count[word_index] += 1
                else:
                    word_list.append(word)
                    word_count.append(1)
        except ValueError as e:
            print(e)
            print(text)
            continue
    return word_count
# In[8]:

# Fetch the user lists
engineer = [user['screen_name'] for user in t.lists.members(owner_screen_name="EnsekiTT", slug="engineer", count=NUM_USER)['users']]
not_engineer = [user['screen_name'] for user in t.lists.members(owner_screen_name="EnsekiTT", slug="notengineer", count=NUM_USER)['users']]
users = list(engineer)
users.extend(not_engineer)
print(users)

# In[9]:

word_list = []
user_vectors = {}
user_vectors_raw = {}
last_id = 0
for user in users:
    tweets = t.statuses.user_timeline(screen_name=user, count=200, include_rts=False, exclude_replies=True)
    #print("user :" + user)
    while len(tweets) < NUM_TWEET:
        max_id = tweets[-1]['id']
        if max_id == last_id:
            print("now :" + str(len(tweets)) + ', ' + str(tweets[-1]['id']) + ', ' + tweets[-1]['text'])
            print("Break!!!" + user)
            break
        last_id = max_id
        # Page backwards through the timeline (max_id is inclusive, so subtract 1)
        tweets.extend(t.statuses.user_timeline(screen_name=user, count=200, include_rts=False, exclude_replies=True, max_id=max_id - 1))
        time.sleep(1)  # statuses/user_timeline is limited to 900 calls per 15 minutes, so wait a little extra to be safe
    user_vectors_raw[user] = [tweet['text'] for tweet in tweets[:NUM_TWEET]]
# In[10]:

from datetime import datetime
import json
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
path = ts + '_tweets.json'
with open(path, 'w') as f:
    json.dump(user_vectors_raw, f)

# In[11]:

len(user_vectors_raw[user])

# In[40]:
for i, user in enumerate(users):
    # Janome char filter settings (reused for the Juman++ analyzer)
    char_filters = [UnicodeNormalizeCharFilter()
                    , RegexReplaceCharFilter(u'[()()*/:゚∀.&;|%д@_○!,?・#@٩( )و]', u'')
                    , RegexReplaceCharFilter(u"http[:\/A-Za-z0-9\n]*", u"")]
    #token_filters = [CompoundNounFilter(),
    #                 POSStopFilter(['動詞','記号', '助詞', '助動詞','接頭詞','数','フィラー']),
    #                 LowerCaseFilter()]
    tokenizer = Jumanpp()
    # Juman++ POS names: 接頭辞/接尾辞 for prefixes/suffixes, 特殊 for symbols
    token_filters = [CompoundNounFilterForJumanpp(),
                     POSStopFilterForJumanpp(['動詞', '記号', '助詞', '助動詞', '接頭辞', '接尾辞', '特殊'])]
    analyzer = AnalyzerForJumanpp(char_filters, tokenizer, token_filters)
    print(str(i) + ': ' + user)
    user_vectors[user] = get_word_count(analyzer, user_vectors_raw[user], word_list)
# In[41]:

# Pad every user's count vector with zeros so they all have the same length
max_len = max([len(user_vectors[key]) for key in user_vectors.keys()])
for key in user_vectors.keys():
    user_len = len(user_vectors[key])
    user_vectors[key].extend([0] * (max_len - user_len))

user_list = []
vectors = []
labels = []
print(engineer)
print(len(engineer))
print(not_engineer)
print(len(not_engineer))
# Label 0 for "not engineer", 1 for "engineer"
for key in user_vectors.keys():
    user_list.append(key)
    if key in engineer:
        labels.append(1)
    elif key in not_engineer:
        labels.append(0)
    vectors.append(user_vectors[key])
print(labels)
print(len(vectors))
# In[42]:

# Train a shallow decision tree and check training accuracy
clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=3, min_samples_leaf=2)
clf = clf.fit(vectors, labels)
predicted = clf.predict(vectors)
print(predicted)
print(sum(predicted == labels) / len(labels))

# In[43]:

# Visualize the trained tree with Graphviz
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=word_list,
                     class_names=['not engineer', 'engineer'],
                     filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())