Tweet analysis
from requests_oauthlib import OAuth1Session
from requests.exceptions import ConnectionError, ReadTimeout, SSLError
import json, datetime, time, pytz, re, sys, traceback, unicodedata, pymongo
from pymongo import MongoClient  # the Connection class is deprecated, so use MongoClient instead
import numpy as np
from collections import defaultdict
from bson.objectid import ObjectId
import MeCab as mc

KEYS = { # set the keys obtained for your own account below
        'consumer_key':'**********',
        'consumer_secret':'**********',
        'access_token':'**********',
        'access_secret':'**********',
       }

twitter = None
connect = None
db = None
tweetdata = None
meta = None
posi_nega_dict = None

def initialize(): # initial setup: Twitter connection info, MongoDB connection, etc.
    global twitter, connect, db, tweetdata, meta, posi_nega_dict
    twitter = OAuth1Session(KEYS['consumer_key'], KEYS['consumer_secret'],
                            KEYS['access_token'], KEYS['access_secret'])
    connect = MongoClient('localhost', 27017)
    db = connect.starbucks
    tweetdata = db.tweetdata
    meta = db.metadata
    posi_nega_dict = db.posi_nega_dict

initialize()
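As a quick sanity check after initialize(), one can verify the MongoDB connection (a minimal sketch, not part of the original; server_info() raises if mongod is not running):

print connect.server_info()['version']   # MongoDB server version
print db.collection_names()              # should list 'tweetdata' etc. once data is stored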
# Fetch 100 tweets for the given search word from the Twitter REST API
def getTweetData(search_word, max_id, since_id):
    global twitter
    url = 'https://api.twitter.com/1.1/search/tweets.json'
    params = {'q': search_word,
              'count':'100',
             }
    # apply max_id if specified
    if max_id != -1:
        params['max_id'] = max_id
    # apply since_id if specified
    if since_id != -1:
        params['since_id'] = since_id

    req = twitter.get(url, params = params)   # fetch the tweet data

    # unpack the response
    if req.status_code == 200:   # success
        timeline = json.loads(req.text)
        metadata = timeline['search_metadata']
        statuses = timeline['statuses']
        limit = req.headers['x-rate-limit-remaining'] if 'x-rate-limit-remaining' in req.headers else 0
        reset = req.headers['x-rate-limit-reset'] if 'x-rate-limit-reset' in req.headers else 0
        return {"result":True, "metadata":metadata, "statuses":statuses, "limit":limit,
                "reset_time":datetime.datetime.fromtimestamp(float(reset)), "reset_time_unix":reset}
    else:   # failure
        print ("Error: %d" % req.status_code)
        return {"result":False, "status_code":req.status_code}

def obj_nullcheck(string):   # True if the object is not None
    return False if string is None else True

def is_exist_id(id_str):     # True if a tweet with this id is already stored
    return tweetdata.find({'id':long(id_str)},{'id':1}).count() > 0
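A minimal usage sketch of the fetch helper (assuming initialize() has run and the API keys are valid):

res = getTweetData(u'スタバ', max_id=-1, since_id=-1)
if res['result']:
    for s in res['statuses']:
        if not is_exist_id(s['id_str']):   # skip tweets already stored
            tweetdata.insert(s)
    print "API calls left in this window:", res['limit']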
# Convert Twitter's date string to a timezone-aware datetime in Japan time
def str_to_date_jp(str_date):
    dts = datetime.datetime.strptime(str_date,'%a %b %d %H:%M:%S +0000 %Y')
    return pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo'))

# Return the current time as UNIX time
def now_unix_time():
    return time.mktime(datetime.datetime.now().timetuple())

# Convert a UTC date string to a Japan-time string
def utc_str_to_jp_str(str_date):
    dts = datetime.datetime.strptime(str_date,'%a %b %d %H:%M:%S +0000 %Y')
    return pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo')).strftime("%Y/%m/%d %H:%M:%S")

def str_to_date(str_date):
    dts = datetime.datetime.strptime(str_date,'%Y-%m-%d %H:%M:%S')
    return pytz.utc.localize(dts)

# Interpret a 'YYYY-mm-dd HH:MM:SS' string as Japan time and return the naive UTC equivalent
def str_to_date_jp_utc(str_date):
    return datetime.datetime.strptime(str_date,'%Y-%m-%d %H:%M:%S') - datetime.timedelta(hours=9)

def date_to_Japan_time(dts):
    return pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo'))

def date_to_Japan_time_str(dts):
    return pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo')).strftime("%Y/%m/%d %H:%M:%S")

def date_to_str(dt):
    return dt.strftime("%Y/%m/%d %H:%M:%S")

# Convert Twitter's date string to UNIX time (Japan time)
def str_to_unix_date_jp(str_date):
    dts = datetime.datetime.strptime(str_date,'%a %b %d %H:%M:%S +0000 %Y')
    dt = pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo'))
    return time.mktime(dt.timetuple())

def unix_time_to_datetime(int_date):
    return datetime.datetime.fromtimestamp(int_date)
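For reference, a short sketch of how these helpers behave on a sample created_at value:

s = 'Wed Mar 18 12:00:00 +0000 2015'    # Twitter's created_at format
print str_to_date_jp(s)                  # 2015-03-18 21:00:00+09:00 (JST = UTC+9)
print utc_str_to_jp_str(s)               # '2015/03/18 21:00:00'
print str_to_unix_date_jp(s)             # the same instant as UNIX time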
#------------- Fetch tweet data repeatedly -------------#
sid = -1
mid = -1
count = 0
res = None

while(True):
    try:
        count = count + 1
        sys.stdout.write("%d, " % count)
        res = getTweetData(u'スタバ', max_id=mid, since_id=sid)
        if res['result'] == False:
            # quit on failure
            print "status_code", res['status_code']
            break

        if int(res['limit']) == 0:    # rate limit reached, so take a break
            # add a Datetime-typed field 'created_datetime' in the meantime
            print "Adding created_at field."
            for d in tweetdata.find({'created_datetime':{ "$exists": False }},{'_id':1, 'created_at':1}):
                tweetdata.update({'_id' : d['_id']},
                                 {'$set' : {'created_datetime' : str_to_date_jp(d['created_at'])}})
            #remove_duplicates()

            # compute the wait; resume 5 seconds after the limit resets
            diff_sec = int(res['reset_time_unix']) - now_unix_time()
            print "sleep %d sec." % (diff_sec+5)
            if diff_sec > 0:
                time.sleep(diff_sec + 5)
        else:
            # process the metadata
            if len(res['statuses']) == 0:
                sys.stdout.write("statuses is none. ")
            elif 'next_results' in res['metadata']:
                # store the results in MongoDB
                meta.insert({"metadata":res['metadata'], "insert_date": now_unix_time()})
                for s in res['statuses']:
                    tweetdata.insert(s)
                next_url = res['metadata']['next_results']
                pattern = r".*max_id=([0-9]*)\&.*"
                ite = re.finditer(pattern, next_url)
                for i in ite:
                    mid = i.group(1)
                    break
            else:
                sys.stdout.write("next is none. finished.")
                break
    except SSLError as e:
        print "SSLError:", e
        print "waiting 5mins"
        time.sleep(5*60)
    except ConnectionError as e:
        print "ConnectionError:", e
        print "waiting 5mins"
        time.sleep(5*60)
    except ReadTimeout as e:
        print "ReadTimeout:", e
        print "waiting 5mins"
        time.sleep(5*60)
    except:
        print "Unexpected error:", sys.exc_info()[0]
        traceback.format_exc(sys.exc_info()[2])
        raise
    finally:
        info = sys.exc_info()
# --- insert Date type information --- #
# Add a Datetime-typed tweet date alongside the string date
for d in tweetdata.find({'created_datetime':{ "$exists": False }},{'_id':1, 'created_at':1}):
    tweetdata.update({'_id' : d['_id']},
                     {'$set' : {'created_datetime' : str_to_date_jp(d['created_at'])}})
# --- Duration & tweet frequency check --- #
date_list = []
for d in tweetdata.find({},{'created_at':1}):
    date_list.append(str_to_unix_date_jp(d['created_at']))

sorted_list = np.sort(date_list)
print unix_time_to_datetime(sorted_list[0])                    # oldest tweet
print unix_time_to_datetime(sorted_list[len(sorted_list)-1])   # newest tweet
# duration divided by tweet count gives seconds per tweet
print (sorted_list[len(sorted_list)-1] - sorted_list[0])/float(len(sorted_list)), "sec/tweet"
# Percentage of tweets that carry location information
num_not_geo = tweetdata.find({'coordinates':None},{'_id':1, 'coordinates':1}).count()
num_geo = tweetdata.find({'coordinates':{"$ne":None}},{'_id':1, 'coordinates':1}).count()
print "num_not_geo", num_not_geo
print "num_geo", num_geo
print "%.3f" % (num_geo / float(num_geo+num_not_geo) * 100), "%"

# print latitude / longitude
for d in tweetdata.find({'coordinates':{"$ne":None}},{'_id':1, 'coordinates':1}):
    co = d['coordinates']['coordinates']
    print co[1], co[0]
# Show accounts that were retweeted more than `limit` times within one hour
def select_outlier_retweet_num_per_hour(from_str_datetime_jp, limit=120):
    '''
    from_str_datetime_jp: start of the one-hour window
    limit: detect accounts retweeted more than this many times
    e.g. select_outlier_retweet_num_per_hour("2015-03-18 22:00:00")
    '''
    result_list = []
    from_date = str_to_date_jp_utc(from_str_datetime_jp)
    to_date = str_to_date_jp_utc(from_str_datetime_jp) + datetime.timedelta(hours=1)
    for d in tweetdata.find({'retweeted_status':{"$ne": None},'created_datetime':{"$gte":from_date, "$lt":to_date}},\
                            {'user':1,'text':1,'entities':1, 'created_at':1, 'id':1}):
        mentioned_username = ""
        if len(d['entities']['user_mentions']) != 0:
            mentioned_username = d['entities']['user_mentions'][0]['screen_name']
        result_list.append({"created_at":utc_str_to_jp_str(d['created_at']),\
                            "screen_name":d['user']['screen_name'],\
                            "referred_name":mentioned_username,\
                            "text":d['text'].replace('\n',' ')\
                            })
    name_dict = defaultdict(int)
    for r in result_list:
        name_dict[r['referred_name']] += 1
    s = sorted(name_dict.iteritems(), key=lambda (k,v): v, reverse=True)   # sort by retweet count
    # retweeted user names and counts, restricted to those above `limit`
    return s[0:int(np.sum(map(lambda (k,v): 1 if v>limit else 0, s)))]

start_date = str_to_date_jp_utc("2015-03-10 19:00:00")
to_date = str_to_date_jp_utc("2015-03-22 22:00:00")

d_diff = (to_date - start_date)
d_hours = d_diff.days * 24 + d_diff.seconds/float(3600)

for i in range(int(d_hours)):
    d = (start_date + datetime.timedelta(hours=i)).strftime("%Y-%m-%d %H:%M:%S")
    result = select_outlier_retweet_num_per_hour(d, limit=540)
    if len(result) > 0:
        print d, result
# Flag the tweets of spam accounts
# list of spam accounts uncovered by 08_spam_detector.py
spam_list = ['**********', '**********', '**********', '**********', '**********']

count = 0
retweeted_name = ""
spam_twitter = set()   # accounts that retweeted a spam account (blacklist)
for d in tweetdata.find({'retweeted_status':{"$ne": None}}):
    try:
        retweeted_name = d['entities']['user_mentions'][0]['screen_name']
    except:
        count += 1
        pattern = r".*@([0-9a-zA-Z_]*).*"
        ite = re.finditer(pattern, d['text'])
        for it in ite:
            retweeted_name = it.group(1)
            break
    if retweeted_name in spam_list:
        # flag retweets of spam accounts as spam
        tweetdata.update({'_id' : d['_id']},{'$set': {'spam':True}})
        # accounts that retweeted spam go on the blacklist too
        spam_twitter.add(d['user']['screen_name'])
print 'classified %d retweets as spam' % count

# classify tweets by blacklisted users as spam
count = 0
for d in tweetdata.find({},{'user.screen_name':1}):
    sc_name = d['user']['screen_name']
    if sc_name in spam_twitter:
        count += 1
        tweetdata.update({'_id' : d['_id']},{'$set': {'spam':True}})
print "classified %d tweets as spam" % count
# Display time-series tweet counts
date_dict = defaultdict(int)
ret_date_dict = defaultdict(int)
norm_date_dict = defaultdict(int)
spam_dict = defaultdict(int)
not_spam_norm_dict = defaultdict(int)
not_spam_ret_dict = defaultdict(int)

for d in tweetdata.find({},{'_id':1, 'created_datetime':1,'retweeted_status':1,'spam':1}):
    str_date = date_to_Japan_time(d['created_datetime']).strftime('%Y\t%m/%d %H %a')
    date_dict[str_date] += 1

    # count spam; non-spam tweets feed the spam-excluded retweet counts
    if ('spam' in d) and (d['spam'] == True):
        spam_dict[str_date] += 1
    else:
        spam_dict[str_date] += 0
        if ('retweeted_status' in d) and obj_nullcheck(d['retweeted_status']):
            not_spam_ret_dict[str_date] += 1
            not_spam_norm_dict[str_date] += 0
        else:
            not_spam_ret_dict[str_date] += 0
            not_spam_norm_dict[str_date] += 1

    # count retweets (spam included)
    if ('retweeted_status' in d) and obj_nullcheck(d['retweeted_status']):
        ret_date_dict[str_date] += 1
        norm_date_dict[str_date] += 0
    else:
        ret_date_dict[str_date] += 0
        norm_date_dict[str_date] += 1

print "date" + "\t\t\t" + "#ALL" + "\t" + "#NotRT" + "\t" + "#RT" + "\t" + "#spam" + "\t" + "#NotRT(exclude spam)" + "\t" + "#RT(exclude spam)"
keys = date_dict.keys()
keys.sort()
for k in keys:
    print k + "\t" + str(date_dict[k]) + "\t" + str(norm_date_dict[k]) + "\t" + str(ret_date_dict[k]) \
            + "\t" + str(spam_dict[k]) + "\t" + str(not_spam_norm_dict[k]) + "\t" + str(not_spam_ret_dict[k])
# work in progress...
# Run each tweet body through MeCab and split it into morphemes.
# Add per-POS fields (noun, verb, adjective, adverb) to the tweet data.

# MeCab morphological analysis
def mecab_analysis(sentence):
    t = mc.Tagger('-Ochasen -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd/')
    sentence = sentence.replace('\n', ' ')
    text = sentence.encode('utf-8')
    node = t.parseToNode(text)
    result_dict = defaultdict(list)
    for i in range(140):   # a tweet is at most 140 characters
        if node.surface != "":   # skip the header and footer nodes
            word_type = node.feature.split(",")[0]
            if word_type in ["名詞", "形容詞", "動詞"]:   # noun, adjective, verb
                plain_word = node.feature.split(",")[6]
                if plain_word != "*":
                    result_dict[word_type.decode('utf-8')].append(plain_word.decode('utf-8'))
            # store region names as an independent field
            if (node.feature.split(",")[1] == "固有名詞") and (node.feature.split(",")[2] == "地域"):
                plain_word = node.feature.split(",")[6]
                if plain_word != "*":
                    result_dict[u'地域名称'].append(plain_word.decode('utf-8'))
        node = node.next
        if node is None:
            break
    return result_dict

for d in tweetdata.find({'mecabed':False},{'_id':1, 'id':1, 'text':1,'noun':1,'verb':1,'adjective':1,'adverb':1}):
    res = mecab_analysis(unicodedata.normalize('NFKC', d['text']))   # normalize half-width kana to full width
    for k in res.keys():
        if k == u'形容詞':   # adjective
            adjective_list = [w for w in res[k]]
            tweetdata.update({'_id' : d['_id']},{'$push': {'adjective':{'$each':adjective_list}}})
        elif k == u'動詞':   # verb
            verb_list = [w for w in res[k]]
            tweetdata.update({'_id' : d['_id']},{'$push': {'verb':{'$each':verb_list}}})
        elif k == u'名詞':   # noun
            noun_list = [w for w in res[k]]
            tweetdata.update({'_id' : d['_id']},{'$push': {'noun':{'$each':noun_list}}})
        elif k == u'副詞':   # adverb
            adverb_list = [w for w in res[k]]
            tweetdata.update({'_id' : d['_id']},{'$push': {'adverb':{'$each':adverb_list}}})
    # add the mecabed flag to mark the tweet as morphologically analyzed
    tweetdata.update({'_id' : d['_id']},{'$set': {'mecabed':True}})
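A quick usage sketch of mecab_analysis (assuming MeCab and the mecab-ipadic-neologd dictionary are installed at the path above; the sample sentence is the one from the commented-out test line):

res = mecab_analysis(u'今日は良い天気ですが、雨ですね。クルマがほしいです。走ります。')
for k, v in res.items():
    print k, ' / '.join(v)   # POS name followed by the base forms found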
# Assign a sentiment score to each tweet from the positive/negative polarity of its words.
# Among the extracted morphemes, positive words count as 1 and negative words as -1;
# dividing by the number of scored words gives an average used as the index.
pn_data = [data for data in posi_nega_dict.find({},{'word':1,'value':1})]

def get_emotion_value(word):
    ret_val = None
    for d in pn_data:
        if d['word'] == word:
            ret_val = d['value']
            break
    return ret_val

def isexist_and_get_data(data, key):
    return data[key] if key in data else None

data = [d for d in tweetdata.find({'emo_val':{ "$exists": True }},{'noun':1,'adjective':1,'verb':1,'adverb':1})]

tweet_list = []
counter = 0
for d in data:
    counter += 1
    if counter % 1000 == 0:
        print counter
        print datetime.datetime.today()
    score = 0
    word_count = 0
    for k in ['noun','adjective','verb','adverb']:
        if type(isexist_and_get_data(d,k)) == list:
            for i in d[k]:
                v = get_emotion_value(i)
                if v is not None:
                    score += v
                    word_count += 1
        else:
            v = get_emotion_value(isexist_and_get_data(d,k))
            if v is not None:
                score += v
                word_count += 1
    d['score'] = score/float(word_count) if word_count != 0 else 0
    tweetdata.update({'_id' : d['_id']},{'$set': {'emo_val':d['score']}},True)
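get_emotion_value() scans pn_data linearly for every word; with a large polarity dictionary, a dict keyed by word is a simple speedup (a sketch, not part of the original; get_emotion_value_fast is a hypothetical name):

pn_dict = {d['word']: d['value'] for d in pn_data}   # word -> polarity value
def get_emotion_value_fast(word):
    return pn_dict.get(word)   # None when the word is not in the dictionary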
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# three sample sentences
data = ["This is a pen.",
        "This is also a pen. Pen is useful.",
        "These are pencils.",
        ]

c_vec = CountVectorizer()             # create the CountVectorizer object
c_vec.fit(data)                       # fit on the vocabulary of all target sentences
c_terms = c_vec.get_feature_names()   # the word corresponding to each vector component
c_tran = c_vec.transform([data[1]])   # count the words of the second sentence

print c_terms
print data[1]
print c_tran.toarray()
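Expected output, roughly (assuming scikit-learn's default tokenizer, which lowercases and drops single-character tokens such as "a"):

# [u'also', u'are', u'is', u'pen', u'pencils', u'these', u'this', u'useful']
# This is also a pen. Pen is useful.
# [[1 0 2 2 0 0 1 1]]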
# Join the MeCab-split words back into one string per tweet
def get_mecabed_strings(from_date_str=None, to_date_str=None, include_rt=False):
    tweet_list = []
    tweet_texts = []
    # only parse the date strings that were actually given (strptime(None) would raise)
    from_date = str_to_date_jp_utc(from_date_str) if from_date_str is not None else None
    to_date = str_to_date_jp_utc(to_date_str) if to_date_str is not None else None

    # build the query for the target period
    if (from_date is not None) and (to_date is not None):
        query = {'created_datetime':{"$gte":from_date, "$lt":to_date}}
    elif from_date is not None:
        query = {'created_datetime':{"$gte":from_date}}
    elif to_date is not None:
        query = {'created_datetime':{"$lt":to_date}}
    else:
        query = {}

    # exclude spam
    query['spam'] = None

    # include retweets or not
    if include_rt == False:
        query['retweeted_status'] = None
    else:
        query['retweeted_status'] = {"$ne": None}

    # fetch the tweets matching the query
    for d in tweetdata.find(query,{'noun':1, 'verb':1, 'adjective':1, 'adverb':1,'text':1}):
        tweet = ""
        # join the MeCab-split words into a space-separated string
        if 'noun' in d:
            for word in d['noun']:
                tweet += word + " "
        if 'verb' in d:
            for word in d['verb']:
                tweet += word + " "
        if 'adjective' in d:
            for word in d['adjective']:
                tweet += word + " "
        if 'adverb' in d:
            for word in d['adverb']:
                tweet += word + " "
        tweet_list.append(tweet)
        tweet_texts.append(d['text'])
    return {"tweet_list":tweet_list, "tweet_texts":tweet_texts}

# before "2015-03-18 00:00:00"
ret_before = get_mecabed_strings(to_date_str="2015-03-18 00:00:00")
tw_list_before = ret_before['tweet_list']

# after "2015-03-18 00:00:00"
ret_after = get_mecabed_strings(from_date_str="2015-03-18 00:00:00")
tw_list_after = ret_after['tweet_list']

# the whole period
ret_all = get_mecabed_strings()
tw_list_all = ret_all['tweet_list']

c_vec = CountVectorizer(stop_words=[u"スタバ"])   # exclude "スタバ", which appears in every tweet
c_vec.fit(tw_list_all)                            # fit on the vocabulary of all tweets
c_terms = c_vec.get_feature_names()               # the word corresponding to each vector component

# treat each period as one document and transform the pair
transformed = c_vec.transform([' '.join(tw_list_before), ' '.join(tw_list_after)])

# subtracting before from after gives the increase per word
sub = transformed[1] - transformed[0]

# indices of the words with the largest increase (note: this slice yields 49 entries)
arg_ind = np.argsort(sub.toarray())[0][:-50:-1]

# display the top words
for i in arg_ind:
    print c_vec.get_feature_names()[i]
# Does the text contain any word from the list?
def is_include_word_list(text, word_list):
    for word in word_list:
        if text.find(word) > -1:
            return True
    return False

date_dict = defaultdict(int)
word_list = [u"新作", u"アーモンドミルク", u"ハニー", u"アーモンド", u"新しい", "with", u"クランチ"]

with open('armond.txt','w') as f:
    for d in tweetdata.find({'spam': None, 'retweeted_status': None},{'created_datetime':1,'text':1}):
        str_date = date_to_Japan_time(d['created_datetime']).strftime('%Y\t%m/%d %H %a')
        text = d['text']
        if is_include_word_list(text, word_list):
            date_dict[str_date] += 1
            # write each match to a file (for verification)
            ret_str = str_date + ' ' + text.replace('\n', ' ') + '\n'
            f.write(ret_str.encode('utf-8'))

print "number of bins:", len(date_dict)
print "date" + "\t\t\t" + "# of Tweet"
keys = date_dict.keys()
keys.sort()
for k in keys:
    print k + "\t" + str(date_dict[k])
# Count records that are missing each POS field
print tweetdata.find({'noun':None},{}).count()
print tweetdata.find({'verb':None},{}).count()
print tweetdata.find({'adjective':None},{}).count()
print tweetdata.find({'adverb':None},{}).count()

# Add an empty field to records missing a POS field
# (note: '$set' creates an empty array; '$push' would insert the list itself as a single element)
for w_type in ['noun', 'verb', 'adjective', 'adverb']:
    for d in tweetdata.find({w_type:None},{'_id':1}):
        tweetdata.update({'_id' : d['_id']},{'$set': {w_type: []}})
# Percentage of tweets that carry location information (spam excluded)
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

num_not_geo = tweetdata.find({'coordinates':None,'spam':None},{'_id':1, 'coordinates':1}).count()
num_geo = tweetdata.find({'coordinates':{"$ne":None},'spam':None},{'_id':1, 'coordinates':1}).count()
print "num_not_geo", num_not_geo
print "num_geo", num_geo
print "%.3f" % (num_geo / float(num_geo+num_not_geo) * 100), "%"

# location data
loc_data = np.array([[d['coordinates']['coordinates'][1], d['coordinates']['coordinates'][0]]\
                     for d in tweetdata.find({'coordinates':{"$ne":None},'spam':None},{'_id':1, 'coordinates':1})])
lat = loc_data[:,0]   # latitude
lon = loc_data[:,1]   # longitude

# simple scatter plots at three zoom levels
xlim_min = [np.min(lon)*.9, 120, 139]
xlim_max = [np.max(lon)*1.1, 150, 140.5]
ylim_min = [np.min(lat)*.9, 20, 35.1]
ylim_max = [np.max(lat)*1.1, 50, 36.1]

for x1,x2,y1,y2 in zip(xlim_min, xlim_max, ylim_min, ylim_max):
    plt.figure(figsize=(10,10))
    plt.xlim(x1,x2)
    plt.ylim(y1,y2)
    plt.scatter(lon, lat, s=20, alpha=0.4, c='b')

#--------------------------------------------------
# map plots with Basemap at several zoom levels
ar = np.arange
xlim_min = [-142, 80, 120, 135, 139]
xlim_max = [ 192, 160, 150, 142, 141]
ylim_min = [ -45, 0, 20, 33, 35]
ylim_max = [ 75, 50, 50, 37, 36.2]
ss = [ 0.7, 0.3, 0.1, 0.03, 0.005]   # marker radius per zoom level

for i, s in zip(ar(len(xlim_min)), ss):
    m = Basemap(projection='merc', llcrnrlat=ylim_min[i], urcrnrlat=ylim_max[i],\
                llcrnrlon=xlim_min[i], urcrnrlon=xlim_max[i], lat_ts=20, resolution='c')
    plt.figure(figsize=(13,13))
    m.bluemarble()
    for x, y in zip(lon, lat):
        m.tissot(x, y, s, 100, facecolor='red', zorder=100, alpha=0.4)
    plt.savefig('plot_map_%s.png' % (str(i)))   # save before show, which may clear the figure
    plt.show()
# Import a nationwide place-name / postal-code dictionary text
# http://www.odani.jp/dragon/ken-all.htm
import codecs
import requests
with codecs.open("timei-all.tsv",'r','shift_jis') as f:
    loc_dict = {l.split('\t')[1]: 0 for l in f.readlines()}
print len(loc_dict)

# collect all nouns into a single list
noun_list = []
ex = noun_list.extend
for w in [d['noun'] for d in tweetdata.find({'coordinates':None,'spam':None},{'_id':1, 'noun':1})]:
    ex(w)

# extract only the words that match a place name
def exist_place(word):
    if type(word) == list:
        return ""
    return word if word in loc_dict else ""

print len(noun_list)
res = np.array([exist_place(word) for word in noun_list])
res2 = np.array(map(len, res))
loc_list_in_tweet = np.unique(res[res2>0])

def get_coordinate_from_location(location_name):
    payload = {'appid': '<your_yahoo_appid>', 'output':'json'}   # please set your own appid
    payload['query'] = location_name   # e.g. u'六本木'
    url = "http://geo.search.olp.yahooapis.jp/OpenLocalPlatform/V1/geoCoder"
    r = requests.get(url, params=payload)
    if r.status_code == 200:
        jdata = json.loads(r.content)
        # average the coordinates returned for the query and use that as the location
        try:
            ret = np.array([map(float, j['Geometry']['Coordinates'].split(',')) for j in jdata['Feature']])
        except KeyError, e:
            print "KeyError(%s)" % str(e)
            return []
        return np.average(ret, axis=0)
    else:
        print "%d: error." % r.status_code
        return []

# attach latitude/longitude to the place names extracted from tweets and store them in MongoDB
location_dict = db.location_dict   # collection for geocoded place names (assumed, by analogy with the others)
for name in loc_list_in_tweet:
    loc = get_coordinate_from_location(name)
    if len(loc) > 0:
        location_dict.insert({"word":name, "latitude":loc[1], "longitude":loc[0]})

# pull the stored data back into a list
w_list = [loc for loc in location_dict.find({})]

# words consisting only of hiragana/katakana are unlikely to be place names, so delete them
for loc in w_list:
    regex = u'^[ぁ-んァ-ン]*$'
    match = re.search(regex, loc['word'], re.U)
    if match:
        print match.group(), loc['longitude'], loc['latitude']
        location_dict.remove({"word":loc['word']})
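For reference, a hedged sketch of what the geocoder helper returns (left commented out, since it needs a valid Yahoo! appid and makes a live request; the exact numbers depend on Yahoo!'s data):

# loc = get_coordinate_from_location(u'六本木')
# if len(loc) > 0:
#     print loc[0], loc[1]   # longitude, latitude (averaged over the returned candidates)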
# Extract region names from the text with MeCab and set them in a field (added after the fact)
def location_name_mecab(sentence):
    t = mc.Tagger('-Ochasen -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd/')
    sentence = sentence.replace('\n', ' ')
    text = sentence.encode('utf-8')
    node = t.parseToNode(text)
    result_dict = defaultdict(list)
    for i in range(140):   # a tweet is at most 140 characters
        if node.surface != "":   # skip the header and footer nodes
            if (node.feature.split(",")[1] == "固有名詞") and (node.feature.split(",")[2] == "地域"):
                plain_word = node.feature.split(",")[6]
                if plain_word != "*":
                    result_dict[u'地域名称'].append(plain_word.decode('utf-8'))
        node = node.next
        if node is None:
            break
    return result_dict

for d in tweetdata.find({'spam':None},{'_id':1, 'text':1}):
    ret = location_name_mecab(d['text'])
    tweetdata.update({'_id' : d['_id']},{'$push': {'location_name':{'$each':ret[u'地域名称']}}})
# Spam tweet separation, part 2: blacklist users who retweet spam tweets
# spam accounts
spam_list = ['**********', '**********', '**********', '**********','**********',\
             '**********','**********','**********']

retweeted_name = ""
spam_twitter = set()
print tweetdata.find({'retweeted_status':{"$ne": None}}).count()
for d in tweetdata.find({'retweeted_status':{"$ne": None}}):
    try:
        retweeted_name = d['entities']['user_mentions'][0]['screen_name']
    except:
        pattern = r".*@([0-9a-zA-Z_]*).*"
        ite = re.finditer(pattern, d['text'])
        for it in ite:
            retweeted_name = it.group(1)
            break
    if retweeted_name in spam_list:
        spam_twitter.add(d['user']['screen_name'])

for user in spam_twitter:
    print user