Created
June 8, 2013 23:39
-
-
Save yuiseki/5737006 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
import sys | |
import traceback | |
import datetime | |
from dateutil.parser import parse | |
import pytz | |
import json | |
import tweepy | |
import pymongo | |
import MeCab | |
import re | |
# Keyword (hashtag) to track. It can be overridden by passing exactly one
# command-line argument.
keyword = "#akiba"
argvs = sys.argv
if len(argvs) == 2:
    keyword = argvs[1]
# Joined with a single space so the output matches a two-item print statement.
print(" ".join(["tracking to ", keyword]))

# All tracked keywords share one database; each keyword gets its own
# collection, named after the keyword itself.
database = "twitter_track"
collection = keyword

# pymongo.Connection was deprecated in PyMongo 2.4 and removed in 3.0 in
# favour of MongoClient.  Prefer MongoClient when the installed driver has it
# and fall back to Connection on older installs.
# NOTE(review): MongoClient acknowledges writes by default, so insert errors
# that Connection ignored may now surface as exceptions.
_mongo_client_factory = getattr(pymongo, "MongoClient", None) or pymongo.Connection
conn = _mongo_client_factory("localhost")
col = conn[database][collection]

# Shared MeCab tagger; the listener splits its "-Owakati" output on whitespace
# to obtain the list of words.
m_wakati = MeCab.Tagger("-Owakati")
class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that stores every received message in MongoDB.

    Each raw message is decoded from JSON, given a ``created_dt`` datetime
    field and inserted into the module-level ``col`` collection.  Messages
    that carry a ``text`` field additionally get a ``text_wakati`` field
    holding the whitespace-split output of the module-level ``m_wakati``
    MeCab tagger.
    """

    @staticmethod
    def _strip_entities(text, entities):
        """Return *text* with hashtags, mentions, URLs and RT markers removed.

        The fragments are removed as literal substrings.  They must not be
        used as regular-expression patterns: URLs routinely contain regex
        metacharacters such as ``?``, ``+`` or ``(``, which would either
        fail to match or raise ``re.error``.
        """
        hashtags = [h["text"] for h in entities.get("hashtags") or []]
        usernames = [u["screen_name"] for u in entities.get("user_mentions") or []]
        urls = [u["url"] for u in entities.get("urls") or []]
        # Same removal order as before: entity strings first, then the
        # leftover punctuation / retweet markers.
        for fragment in hashtags + usernames + urls + [":", "RT", "#", "@"]:
            if fragment:
                text = text.replace(fragment, "")
        return text

    def on_data(self, data):
        """Store one raw stream message and print a one-line summary.

        Args:
            data: JSON-encoded message as delivered by the stream.

        Messages without ``text`` / ``user`` fields are still stored, but
        they are neither tokenized nor summarized, so they can no longer
        raise ``KeyError`` here.
        """
        print("-----")
        status = json.loads(data)

        # Convert only the timestamp to a datetime so that documents can be
        # sorted and queried by time.
        at = status.get("created_at")
        if at is None:
            status["created_dt"] = datetime.datetime.now(pytz.utc)
        else:
            status["created_dt"] = parse(at)

        # Word extraction with MeCab.
        if "text" in status:
            try:
                # Before extracting words, remove URLs, hashtags and usernames
                # from the text.  That information is available from other
                # fields, and MeCab does not extract it well.
                cleaned = self._strip_entities(
                    status["text"], status.get("entities") or {})
                status["text_wakati"] = re.split(
                    r'\s', m_wakati.parse(cleaned.encode("utf-8")))
            except Exception:
                # Tokenization is best-effort: report the failure and still
                # store the message without ``text_wakati``.
                print(traceback.format_exc())

        col.insert(status)

        if "text" in status and "user" in status:
            # Shift the timestamp by nine hours for display.
            # NOTE(review): presumably UTC -> JST; confirm.
            print(" ".join([
                (status["created_dt"] + datetime.timedelta(hours=9)).strftime("%H:%M:%S"),
                status["user"]["screen_name"],
                ":",
                status["text"].replace('\n', ''),
            ]))

    def on_error(self, status_code):
        """Report the error status code on stderr and return False.

        NOTE(review): returning False presumably tells tweepy to stop the
        stream; confirm against the installed tweepy version.
        """
        sys.stderr.write('Error! :' + " " + str(status_code) + "\n")
        return False

    def on_timeout(self):
        """Report a timeout on stderr and return False."""
        sys.stderr.write('Timeout...' + "\n")
        return False
# OAuth credentials for the Twitter API (all values are blank in this file).
conf = {
    "consumer_key": "",
    "consumer_secret": "",
    "access_key": "",
    "access_secret": "",
}

auth = tweepy.OAuthHandler(conf["consumer_key"], conf["consumer_secret"])
auth.set_access_token(conf["access_key"], conf["access_secret"])

# Open the stream filtered on the tracked keyword.  Any exception raised
# while building or running the stream is printed as a traceback instead of
# propagating.
try:
    stream = tweepy.streaming.Stream(auth, CustomStreamListener())
    stream.filter(track=[keyword])
except Exception:
    print(traceback.format_exc())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment