@matsuken92
Tweet analysis
from requests_oauthlib import OAuth1Session
from requests.exceptions import ConnectionError, ReadTimeout, SSLError
import json, datetime, time, pytz, re, sys, traceback, unicodedata, pymongo
import requests
# from pymongo import Connection  # the Connection class was removed, so MongoClient is used instead
from pymongo import MongoClient
import numpy as np
from collections import defaultdict
from bson.objectid import ObjectId
import MeCab as mc

KEYS = {  # put the keys obtained for your own account here
    'consumer_key': '**********',
    'consumer_secret': '**********',
    'access_token': '**********',
    'access_secret': '**********',
}

twitter = None
connect = None
db = None
tweetdata = None
meta = None
posi_nega_dict = None
def initialize():  # initial setup: Twitter connection info, MongoDB connection, etc.
    global twitter, connect, db, tweetdata, meta, posi_nega_dict
    twitter = OAuth1Session(KEYS['consumer_key'], KEYS['consumer_secret'],
                            KEYS['access_token'], KEYS['access_secret'])
    # connect = Connection('localhost', 27017)  # the Connection class was removed, so MongoClient is used instead
    connect = MongoClient('localhost', 27017)
    db = connect.starbucks
    tweetdata = db.tweetdata
    meta = db.metadata
    posi_nega_dict = db.posi_nega_dict

initialize()
# Fetch 100 tweets for a given search word from the Twitter REST API
def getTweetData(search_word, max_id, since_id):
    global twitter
    url = 'https://api.twitter.com/1.1/search/tweets.json'
    params = {'q': search_word,
              'count': '100',
              }
    # set max_id if one was given
    if max_id != -1:
        params['max_id'] = max_id
    # set since_id if one was given
    if since_id != -1:
        params['since_id'] = since_id
    req = twitter.get(url, params=params)  # fetch the tweet data
    # unpack the response
    if req.status_code == 200:  # success
        timeline = json.loads(req.text)
        metadata = timeline['search_metadata']
        statuses = timeline['statuses']
        limit = req.headers['x-rate-limit-remaining'] if 'x-rate-limit-remaining' in req.headers else 0
        reset = req.headers['x-rate-limit-reset'] if 'x-rate-limit-reset' in req.headers else 0
        return {"result": True, "metadata": metadata, "statuses": statuses, "limit": limit,
                "reset_time": datetime.datetime.fromtimestamp(float(reset)), "reset_time_unix": reset}
    else:  # failure
        print "Error: %d" % req.status_code
        return {"result": False, "status_code": req.status_code}
def obj_nullcheck(string):  # returns True if the object is not None
    return False if string is None else True

def is_exist_id(id_str):
    return tweetdata.find({'id': long(id_str)}, {'id': 1}).count() > 0

# Convert a Twitter-style UTC date string to a JST-aware datetime
def str_to_date_jp(str_date):
    dts = datetime.datetime.strptime(str_date, '%a %b %d %H:%M:%S +0000 %Y')
    return pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo'))

# Return the current time as UNIX time
def now_unix_time():
    return time.mktime(datetime.datetime.now().timetuple())

# Convert a Twitter-style UTC date string to a JST date string
def utc_str_to_jp_str(str_date):
    dts = datetime.datetime.strptime(str_date, '%a %b %d %H:%M:%S +0000 %Y')
    return pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo')).strftime("%Y/%m/%d %H:%M:%S")

def str_to_date(str_date):
    dts = datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
    return pytz.utc.localize(dts)

# Interpret a 'YYYY-mm-dd HH:MM:SS' string as JST and return the corresponding naive UTC datetime
def str_to_date_jp_utc(str_date):
    return datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S') - datetime.timedelta(hours=9)

def date_to_Japan_time(dts):
    return pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo'))

def date_to_Japan_time_str(dts):
    return pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo')).strftime("%Y/%m/%d %H:%M:%S")

def date_to_str(dt):
    return dt.strftime("%Y/%m/%d %H:%M:%S")

# Convert a Twitter-style UTC date string to a UNIX time based on JST
def str_to_unix_date_jp(str_date):
    dts = datetime.datetime.strptime(str_date, '%a %b %d %H:%M:%S +0000 %Y')
    dt = pytz.utc.localize(dts).astimezone(pytz.timezone('Asia/Tokyo'))
    return time.mktime(dt.timetuple())

def unix_time_to_datetime(int_date):
    return datetime.datetime.fromtimestamp(int_date)
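# Illustrative sketch: Twitter's created_at strings look like "Wed Mar 18 23:51:50 +0000 2015".
# str_to_date_jp turns them into JST-aware datetimes, str_to_unix_date_jp into UNIX times.
# print str_to_date_jp("Wed Mar 18 23:51:50 +0000 2015")  # -> 2015-03-19 08:51:50+09:00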
# ------------- fetch tweet data repeatedly ------------- #
sid = -1
mid = -1
count = 0
res = None
while(True):
    try:
        count = count + 1
        sys.stdout.write("%d, " % count)
        res = getTweetData(u'スタバ', max_id=mid, since_id=sid)
        if res['result'] == False:
            # stop on failure
            print "status_code", res['status_code']
            break
        if int(res['limit']) == 0:    # rate limit reached, so take a break
            # add the Datetime-typed column 'created_datetime'
            print "Adding created_at field."
            for d in tweetdata.find({'created_datetime': {"$exists": False}}, {'_id': 1, 'created_at': 1}):
                # print str_to_date_jp(d['created_at'])
                tweetdata.update({'_id': d['_id']},
                                 {'$set': {'created_datetime': str_to_date_jp(d['created_at'])}})
            # remove_duplicates()
            # compute the wait time and resume 5 seconds after the limit resets
            diff_sec = int(res['reset_time_unix']) - now_unix_time()
            print "sleep %d sec." % (diff_sec + 5)
            if diff_sec > 0:
                time.sleep(diff_sec + 5)
        else:
            # process the metadata
            if len(res['statuses']) == 0:
                sys.stdout.write("statuses is none. ")
            elif 'next_results' in res['metadata']:
                # store the results in MongoDB
                meta.insert({"metadata": res['metadata'], "insert_date": now_unix_time()})
                for s in res['statuses']:
                    tweetdata.insert(s)
                next_url = res['metadata']['next_results']
                pattern = r".*max_id=([0-9]*)\&.*"
                ite = re.finditer(pattern, next_url)
                for i in ite:
                    mid = i.group(1)
                    break
            else:
                sys.stdout.write("next is none. finished.")
                break
    except SSLError as e:
        print "SSLError: {0}".format(e)
        print "waiting 5mins"
        time.sleep(5 * 60)
    except ConnectionError as e:
        print "ConnectionError: {0}".format(e)
        print "waiting 5mins"
        time.sleep(5 * 60)
    except ReadTimeout as e:
        print "ReadTimeout: {0}".format(e)
        print "waiting 5mins"
        time.sleep(5 * 60)
    except:
        print "Unexpected error:", sys.exc_info()[0]
        print traceback.format_exc()
        raise
    finally:
        info = sys.exc_info()
# --- insert Date type information --- #
# In addition to the string date, add a Datetime-typed attribute for the tweet date
for d in tweetdata.find({'created_datetime': {"$exists": False}}, {'_id': 1, 'created_at': 1}):
    # print str_to_date_jp(d['created_at'])
    tweetdata.update({'_id': d['_id']},
                     {'$set': {'created_datetime': str_to_date_jp(d['created_at'])}})

# --- Duration & tweet frequency check --- #
date_list = []
for d in tweetdata.find({}, {'created_at': 1}):
    date_list.append(str_to_unix_date_jp(d['created_at']))
sorted_list = np.sort(date_list)
print unix_time_to_datetime(sorted_list[0])
print unix_time_to_datetime(sorted_list[len(sorted_list) - 1])
print (sorted_list[len(sorted_list) - 1] - sorted_list[0]) / float(len(sorted_list)), "sec/tweet"

# Proportion of tweets that carry location information
num_not_geo = tweetdata.find({'coordinates': None}, {'_id': 1, 'coordinates': 1}).count()
num_geo = tweetdata.find({'coordinates': {"$ne": None}}, {'_id': 1, 'coordinates': 1}).count()
print "num_not_geo", num_not_geo
print "num_geo", num_geo
print "%.3f" % (num_geo / float(num_geo + num_not_geo) * 100), "%"

# Print latitude and longitude
for d in tweetdata.find({'coordinates': {"$ne": None}}, {'_id': 1, 'coordinates': 1}):
    co = d['coordinates']['coordinates']
    print co[1], co[0]
# List accounts that were retweeted more than `limit` times within one hour
def select_outlier_retweet_num_per_hour(from_str_datetime_jp, limit=120):
    '''
    from_str_datetime_jp: start time of the one-hour window
    limit: detect accounts retweeted more than this many times
    e.g. select_outlier_retweet_num_per_hour("2015-03-18 22:00:00")
    '''
    result_list = []
    from_date = str_to_date_jp_utc(from_str_datetime_jp)
    to_date = str_to_date_jp_utc(from_str_datetime_jp) + datetime.timedelta(hours=1)
    for d in tweetdata.find({'retweeted_status': {"$ne": None}, 'created_datetime': {"$gte": from_date, "$lt": to_date}},
                            {'user': 1, 'text': 1, 'entities': 1, 'created_at': 1, 'id': 1}):
        mentioned_username = ""
        if len(d['entities']['user_mentions']) != 0:
            mentioned_username = d['entities']['user_mentions'][0]['screen_name']
        result_list.append({"created_at": utc_str_to_jp_str(d['created_at']),
                            "screen_name": d['user']['screen_name'],
                            "referred_name": mentioned_username,
                            "text": d['text'].replace('\n', ' ')
                            })
    name_dict = defaultdict(int)
    for r in result_list:
        name_dict[r['referred_name']] += 1
    s = sorted(name_dict.iteritems(), key=lambda (k, v): v, reverse=True)  # sort by retweet count
    # retweeted user names and retweet counts that exceed `limit`
    return s[0:int(np.sum(map(lambda (k, v): 1 if v > limit else 0, s)))]

start_date = str_to_date_jp_utc("2015-03-10 19:00:00")
to_date = str_to_date_jp_utc("2015-03-22 22:00:00")
d_diff = (to_date - start_date)
d_hours = d_diff.days * 24 + d_diff.seconds / float(3600)
for i in range(int(d_hours)):
    d = (start_date + datetime.timedelta(hours=i)).strftime("%Y-%m-%d %H:%M:%S")
    result = select_outlier_retweet_num_per_hour(d, limit=540)
    if len(result) > 0:
        print d, result
# Flag tweets from spam accounts
# list of spam accounts identified by 08_spam_detector.py
spam_list = ['**********', '**********', '**********', '**********', '**********']
spam_twitter = set()  # accounts that retweeted a spam account (blacklist)
count = 0
retweeted_name = ""
for d in tweetdata.find({'retweeted_status': {"$ne": None}}):
    try:
        retweeted_name = d['entities']['user_mentions'][0]['screen_name']
    except:
        count += 1
        pattern = r".*@([0-9a-zA-Z_]*).*"
        ite = re.finditer(pattern, d['text'])
        for it in ite:
            retweeted_name = it.group(1)
            break
    if retweeted_name in spam_list:
        # flag retweets of spam accounts as spam
        tweetdata.update({'_id': d['_id']}, {'$set': {'spam': True}})
        # accounts that retweeted spam also go on the blacklist
        spam_twitter.add(d['user']['screen_name'])
print '%d retweets were classified as spam' % count

# Classify tweets from blacklisted users as spam
count = 0
for d in tweetdata.find({}, {'user.screen_name': 1}):
    sc_name = d['user']['screen_name']
    if sc_name in spam_twitter:
        count += 1
        tweetdata.update({'_id': d['_id']}, {'$set': {'spam': True}})
print "%d tweets were classified as spam" % count
# Display tweet counts as a time series
date_dict = defaultdict(int)
ret_date_dict = defaultdict(int)
norm_date_dict = defaultdict(int)
spam_dict = defaultdict(int)
not_spam_norm_dict = defaultdict(int)
not_spam_ret_dict = defaultdict(int)
for d in tweetdata.find({}, {'_id': 1, 'created_datetime': 1, 'retweeted_status': 1, 'spam': 1}):
    str_date = date_to_Japan_time(d['created_datetime']).strftime('%Y\t%m/%d %H %a')
    date_dict[str_date] += 1
    # count spam
    if ('spam' in d) and (d['spam'] == True):
        spam_dict[str_date] += 1
    else:
        spam_dict[str_date] += 0
        # count retweets among non-spam tweets
        if 'retweeted_status' not in d:
            not_spam_ret_dict[str_date] += 0
            not_spam_norm_dict[str_date] += 1
        elif obj_nullcheck(d['retweeted_status']):
            not_spam_ret_dict[str_date] += 1
            not_spam_norm_dict[str_date] += 0
        else:
            not_spam_ret_dict[str_date] += 0
            not_spam_norm_dict[str_date] += 1
    # count retweets
    if 'retweeted_status' not in d:
        ret_date_dict[str_date] += 0
        norm_date_dict[str_date] += 1
    elif obj_nullcheck(d['retweeted_status']):
        ret_date_dict[str_date] += 1
        norm_date_dict[str_date] += 0
    else:
        ret_date_dict[str_date] += 0
        norm_date_dict[str_date] += 1

print "date" + "\t\t\t" + "#ALL" + "\t" + "#NotRT" + "\t" + "#RT" + "\t" + "#spam" + "\t" + "#NotRT(exclude spam)" + "\t" + "#RT(exclude spam)"
keys = date_dict.keys()
keys.sort()
for k in keys:
    print k + "\t" + str(date_dict[k]) + "\t" + str(norm_date_dict[k]) + "\t" + str(ret_date_dict[k]) \
        + "\t" + str(spam_dict[k]) + "\t" + str(not_spam_norm_dict[k]) + "\t" + str(not_spam_ret_dict[k])
# Run the tweet text through MeCab and decompose it into morphemes.
# The results are added to each tweet as per-part-of-speech attributes: noun, verb, adjective, adverb.
# MeCab morphological analysis
def mecab_analysis(sentence):
    t = mc.Tagger('-Ochasen -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd/')
    # sentence = u"今日は良い天気ですが、雨ですね。クルマがほしいです。走ります。"
    sentence = sentence.replace('\n', ' ')
    text = sentence.encode('utf-8')
    node = t.parseToNode(text)
    result_dict = defaultdict(list)
    for i in range(140):
        if node.surface != "":  # skip the header and footer nodes
            word_type = node.feature.split(",")[0]
            if word_type in ["名詞", "形容詞", "動詞"]:
                plain_word = node.feature.split(",")[6]
                if plain_word != "*":
                    result_dict[word_type.decode('utf-8')].append(plain_word.decode('utf-8'))
            # store region names in a separate field
            if (node.feature.split(",")[1] == "固有名詞") and (node.feature.split(",")[2] == "地域"):
                plain_word = node.feature.split(",")[6]
                if plain_word != "*":
                    result_dict[u'地域名称'].append(plain_word.decode('utf-8'))
        node = node.next
        if node is None:
            break
    return result_dict
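# Illustrative sketch of the return value: mecab_analysis gives a dict keyed by part of speech,
# e.g. mecab_analysis(u"今日は良い天気ですが、雨ですね。") might yield something like
# {u'名詞': [u'今日', u'天気', u'雨'], u'形容詞': [u'良い']}, depending on the dictionary used.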
for d in tweetdata.find({'mecabed': False}, {'_id': 1, 'id': 1, 'text': 1, 'noun': 1, 'verb': 1, 'adjective': 1, 'adverb': 1}):
    res = mecab_analysis(unicodedata.normalize('NFKC', d['text']))  # normalize half-width kana to full-width
    for k in res.keys():
        if k == u'形容詞':  # adjective
            adjective_list = []
            for w in res[k]:
                adjective_list.append(w)
            tweetdata.update({'_id': d['_id']}, {'$push': {'adjective': {'$each': adjective_list}}})
        elif k == u'動詞':  # verb
            verb_list = []
            for w in res[k]:
                # print k, w
                verb_list.append(w)
            tweetdata.update({'_id': d['_id']}, {'$push': {'verb': {'$each': verb_list}}})
        elif k == u'名詞':  # noun
            noun_list = []
            for w in res[k]:
                noun_list.append(w)
            tweetdata.update({'_id': d['_id']}, {'$push': {'noun': {'$each': noun_list}}})
        elif k == u'副詞':  # adverb
            adverb_list = []
            for w in res[k]:
                adverb_list.append(w)
            tweetdata.update({'_id': d['_id']}, {'$push': {'adverb': {'$each': adverb_list}}})
    # mark the tweet as morphologically analyzed with a 'mecabed' flag
    tweetdata.update({'_id': d['_id']}, {'$set': {'mecabed': True}})
# Assign each tweet a sentiment score based on the positive/negative polarity of its words.
# Among the words extracted by morphological analysis, positive words count as 1 and negative words as -1;
# dividing the total by the number of extracted words gives an average, which is used as the score.
pn_data = [data for data in posi_nega_dict.find({}, {'word': 1, 'value': 1})]

def get_emotion_value(word):
    ret_val = None
    for d in pn_data:
        if d['word'] == word:
            ret_val = d['value']
            break
    return ret_val
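# Possible optimization (sketch, not in the original): the linear scan above costs O(len(pn_data))
# per word; building a dict once would make each lookup O(1).
# pn_dict = {d['word']: d['value'] for d in pn_data}
# def get_emotion_value_fast(word):  # hypothetical helper
#     return pn_dict.get(word)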
def isexist_and_get_data(data, key):
    return data[key] if key in data else None

data = [d for d in tweetdata.find({'emo_val': {"$exists": True}}, {'noun': 1, 'adjective': 1, 'verb': 1, 'adverb': 1})]
tweet_list = []
counter = 0
for d in data:
    counter += 1
    if counter % 1000 == 0:
        print counter
        print datetime.datetime.today()
    score = 0
    word_count = 0
    for k in ['noun', 'adjective', 'verb', 'adverb']:
        if type(isexist_and_get_data(d, k)) == list:
            for i in d[k]:
                v = get_emotion_value(i)
                if v is not None:
                    score += v
                    word_count += 1
        else:
            v = get_emotion_value(isexist_and_get_data(d, k))
            if v is not None:
                score += v
                word_count += 1
    d['score'] = score / float(word_count) if word_count != 0 else 0
    tweetdata.update({'_id': d['_id']}, {'$set': {'emo_val': d['score']}}, True)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# three sample sentences
data = ["This is a pen.",
        "This is also a pen. Pen is useful.",
        "These are pencils.",
        ]
c_vec = CountVectorizer()            # create a CountVectorizer object
c_vec.fit(data)                      # fit on the vocabulary of all target sentences
c_terms = c_vec.get_feature_names()  # the word corresponding to each vector component
c_tran = c_vec.transform([data[1]])  # count the words of the second sentence
print c_terms
print data[1]
print c_tran.toarray()
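# Expected output (sketch, assuming scikit-learn's default tokenizer, which lowercases and
# drops single-character tokens such as "a"):
# c_terms           -> [u'also', u'are', u'is', u'pen', u'pencils', u'these', u'this', u'useful']
# c_tran.toarray()  -> [[1 0 2 2 0 0 1 1]]  # "also" x1, "is" x2, "pen" x2, "this" x1, "useful" x1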
# Join the MeCab-separated words of each tweet back into a single string.
def get_mecabed_strings(from_date_str=None, to_date_str=None, include_rt=False):
    tweet_list = []
    tweet_texts = []
    from_date = str_to_date_jp_utc(from_date_str) if from_date_str is not None else None
    to_date = str_to_date_jp_utc(to_date_str) if to_date_str is not None else None
    # set the query condition for the target period
    if (from_date_str is not None) and (to_date_str is not None):
        query = {'created_datetime': {"$gte": from_date, "$lt": to_date}}
    elif (from_date_str is not None) and (to_date_str is None):
        query = {'created_datetime': {"$gte": from_date}}
    elif (from_date_str is None) and (to_date_str is not None):
        query = {'created_datetime': {"$lt": to_date}}
    else:
        query = {}
    # exclude spam
    query['spam'] = None
    # include retweets or not
    if include_rt == False:
        query['retweeted_status'] = None
    else:
        query['retweeted_status'] = {"$ne": None}
    # fetch the tweets that match the condition
    for d in tweetdata.find(query, {'noun': 1, 'verb': 1, 'adjective': 1, 'adverb': 1, 'text': 1}):
        tweet = ""
        # build a space-separated string of the words already split by MeCab
        if 'noun' in d:
            for word in d['noun']:
                tweet += word + " "
        if 'verb' in d:
            for word in d['verb']:
                tweet += word + " "
        if 'adjective' in d:
            for word in d['adjective']:
                tweet += word + " "
        if 'adverb' in d:
            for word in d['adverb']:
                tweet += word + " "
        tweet_list.append(tweet)
        tweet_texts.append(d['text'])
    return {"tweet_list": tweet_list, "tweet_texts": tweet_texts}
# "2015-03-18 00:00:00"以前
ret_before = get_mecabed_strings(to_date_str="2015-03-18 00:00:00")
tw_list_before = ret_before['tweet_list']
# "2015-03-18 00:00:00"以降
ret_after = get_mecabed_strings(from_date_str="2015-03-18 00:00:00")
tw_list_after= ret_after['tweet_list']
# 全期間
ret_all = get_mecabed_strings()
tw_list_all = ret_all['tweet_list']
c_vec = CountVectorizer(stop_words=[u"スタバ"]) # 「スタバ」は全Tweetに含まれるので除外
c_vec.fit(tw_list_all) # 全Tweetに含まれる単語の集合をここでセット
c_terms = c_vec.get_feature_names() # 各ベクトル要素に対応する単語を表すベクトル
# 期間の前後でひとまとまりと考え、transformする
transformed = c_vec.transform([' '.join(tw_list_before),' '.join(tw_list_after)])
# afterからbeforeを引くことで増分をsubに代入
sub = transformed[1] - transformed[0]
# トップ50がどの位置にあるかを取り出す
arg_ind = np.argsort(sub.toarray())[0][:-50:-1]
# トップ50の表示
for i in arg_ind:
print c_vec.get_feature_names()[i]
def is_include_word_list(text, word_list, f):
    for word in word_list:
        if text.find(word) > -1:
            return True
    return False

date_dict = defaultdict(int)
word_list = [u"新作", u"アーモンドミルク", u"ハニー", u"アーモンド", u"新しい", "with", u"クランチ"]
with open('armond.txt', 'w') as f:
    for d in tweetdata.find({'spam': None, 'retweeted_status': None}, {'created_datetime': 1, 'text': 1}):
        str_date = date_to_Japan_time(d['created_datetime']).strftime('%Y\t%m/%d %H %a')
        text = d['text']
        if is_include_word_list(text, word_list, f):
            date_dict[str_date] += 1
            # write the matched tweets to a file (for verification)
            ret_str = str_date + ' ' + text.replace('\n', ' ') + '\n'
            f.write(ret_str.encode('utf-8'))

print "date_dict", len(date_dict)
print "number of bins:", len(date_dict)
print "date" + "\t\t\t" + "# of Tweet"
keys = date_dict.keys()
keys.sort()
for k in keys:
    print k + "\t" + str(date_dict[k])
# Count how many records are missing each part-of-speech field
print tweetdata.find({'noun': None}, {}).count()
print tweetdata.find({'verb': None}, {}).count()
print tweetdata.find({'adjective': None}, {}).count()
print tweetdata.find({'adverb': None}, {}).count()

# Add the field to records that are missing each part of speech
for w_type in ['noun', 'verb', 'adjective', 'adverb']:
    for d in tweetdata.find({w_type: None}, {'_id': 1}):
        tweetdata.update({'_id': d['_id']}, {'$push': {w_type: []}})
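# Note: '$push' with [] appends an empty list as an element, so the field becomes [[]];
# if the intent is just an empty array, '$set': {w_type: []} would be the more direct form (assumption).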
# Proportion of tweets that carry location information (excluding spam)
num_not_geo = tweetdata.find({'coordinates': None, 'spam': None}, {'_id': 1, 'coordinates': 1}).count()
num_geo = tweetdata.find({'coordinates': {"$ne": None}, 'spam': None}, {'_id': 1, 'coordinates': 1}).count()
print "num_not_geo", num_not_geo
print "num_geo", num_geo
print "%.3f" % (num_geo / float(num_geo + num_not_geo) * 100), "%"

# Location data
import matplotlib.pyplot as plt
loc_data = np.array([[d['coordinates']['coordinates'][1], d['coordinates']['coordinates'][0]]
                     for d in tweetdata.find({'coordinates': {"$ne": None}, 'spam': None}, {'_id': 1, 'coordinates': 1})])
lat = loc_data[:, 0]  # latitude
lon = loc_data[:, 1]  # longitude
xlim_min = [np.min(lon) * .9, 120, 139]
xlim_max = [np.max(lon) * 1.1, 150, 140.5]
ylim_min = [np.min(lat) * .9, 20, 35.1]
ylim_max = [np.max(lat) * 1.1, 50, 36.1]
for x1, x2, y1, y2 in zip(xlim_min, xlim_max, ylim_min, ylim_max):
    plt.figure(figsize=(10, 10))
    plt.xlim(x1, x2)
    plt.ylim(y1, y2)
    plt.scatter(lon, lat, s=20, alpha=0.4, c='b')
# --------------------------------------------------
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

ar = np.arange
enlarge = [1, 2, 4, 8, 16, 32]
w_list = [15000000. / (i) for i in enlarge]
h_list = [9000000. / (i) for i in enlarge]
xlim_min = [-142, 80, 120, 135, 139]
xlim_max = [192, 160, 150, 142, 141]
ylim_min = [-45, 0, 20, 33, 35]
ylim_max = [75, 50, 50, 37, 36.2]
ss = [0.7, 0.3, 0.1, 0.03, 0.005]
for i, s in zip(ar(len(xlim_min)), ss):
    m = Basemap(projection='merc', llcrnrlat=ylim_min[i], urcrnrlat=ylim_max[i],
                llcrnrlon=xlim_min[i], urcrnrlon=xlim_max[i], lat_ts=20, resolution='c')
    plt.figure(figsize=(13, 13))
    m.plot(lon, lat, 'ro')
    m.bluemarble()
    for x, y in zip(lon, lat):
        m.tissot(x, y, s, 100, facecolor='red', zorder=100, alpha=0.4)
    plt.show()
    plt.savefig('plot_map_%s.png' % (str(i)))
# Import the nationwide place-name / postal-code dictionary text
# http://www.odani.jp/dragon/ken-all.htm
# nationwide place-name and postal-code dictionary text
import codecs
with codecs.open("timei-all.tsv", 'r', 'shift_jis') as f:
    loc_dict = {l.split('\t')[1]: 0 for l in f.readlines()}
print len(loc_dict)

# Collect every noun into a single list
noun_list = []
ex = noun_list.extend
for w in [d['noun'] for d in tweetdata.find({'coordinates': None, 'spam': None}, {'_id': 1, 'noun': 1})]:
    ex(w)

# Keep only the words that match the place-name dictionary
def exist_place(word):
    if type(word) == list:
        return ""
    return word if word in loc_dict else ""

print len(noun_list)
res = np.array([exist_place(word) for word in noun_list])
res2 = np.array(map(len, res))
loc_list_in_tweet = np.unique(res[res2 > 0])
def get_coordinate_from_location(location_name):
    payload = {'appid': '<your Yahoo! appid>', 'output': 'json'}  # please set your own appid.
    payload['query'] = location_name  # e.g. u'六本木'
    url = "http://geo.search.olp.yahooapis.jp/OpenLocalPlatform/V1/geoCoder"
    r = requests.get(url, params=payload)
    if r.status_code == 200:
        jdata = json.loads(r.content)
        # average the list of coordinates returned for the query and use it as the location's latitude/longitude
        try:
            ret = np.array([map(float, j['Geometry']['Coordinates'].split(',')) for j in jdata['Feature']])
        except KeyError as e:
            print "KeyError(%s)" % str(e)
            return []
        return np.average(ret, axis=0)
    else:
        print "%d: error." % r.status_code
        return []
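# Hypothetical usage sketch (requires a valid Yahoo! appid in the payload above):
# print get_coordinate_from_location(u'六本木')  # -> e.g. [lon, lat], roughly array([139.73..., 35.66...])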
# Attach latitude/longitude to the place names extracted from tweets and import them into MongoDB
location_dict = db.location_dict  # collection for place names and their coordinates
for name in loc_list_in_tweet:
    loc = get_coordinate_from_location(name)
    if len(loc) > 0:
        location_dict.insert({"word": name, "latitude": loc[1], "longitude": loc[0]})

# Read the imported data back into a list
w_list = [loc for loc in location_dict.find({})]

# Words consisting only of hiragana or katakana are unlikely to be place names, so delete them
import re
for loc in w_list:
    regex = u'^[ぁ-んァ-ン]*$'
    match = re.search(regex, loc['word'], re.U)
    if match:
        print match.group(), loc['longitude'], loc['latitude']
        location_dict.remove({"word": loc['word']})
# Extract region names from the tweet text with MeCab and set them in a field (added afterwards)
def location_name_mecab(sentence):
    t = mc.Tagger('-Ochasen -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd/')
    sentence = sentence.replace('\n', ' ')
    text = sentence.encode('utf-8')
    node = t.parseToNode(text)
    result_dict = defaultdict(list)
    for i in range(140):
        if node.surface != "":  # skip the header and footer nodes
            if (node.feature.split(",")[1] == "固有名詞") and (node.feature.split(",")[2] == "地域"):
                plain_word = node.feature.split(",")[6]
                if plain_word != "*":
                    result_dict[u'地域名称'].append(plain_word.decode('utf-8'))
        node = node.next
        if node is None:
            break
    return result_dict

for d in tweetdata.find({'spam': None}, {'_id': 1, 'text': 1}):
    ret = location_name_mecab(d['text'])
    tweetdata.update({'_id': d['_id']}, {'$push': {'location_name': {'$each': ret[u'地域名称']}}})
# Separating spam tweets, part 2: block users who retweet spam tweets
# spam accounts
spam_list = ['**********', '**********', '**********', '**********', '**********',
             '**********', '**********', '**********']
retweeted_name = ""
spam_twitter = set()
print tweetdata.find({'retweeted_status': {"$ne": None}}).count()
for d in tweetdata.find({'retweeted_status': {"$ne": None}}):
    try:
        retweeted_name = d['entities']['user_mentions'][0]['screen_name']
    except:
        pattern = r".*@([0-9a-zA-Z_]*).*"
        ite = re.finditer(pattern, d['text'])
        for it in ite:
            retweeted_name = it.group(1)
            break
    if retweeted_name in spam_list:
        spam_twitter.add(d['user']['screen_name'])
for user in spam_twitter:
    print user