Last active
August 8, 2017 21:47
-
-
Save brendano/fa7fae9e8a3ad40500474aa89b226039 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
r"""
stdin: IDs of tweets to get (whitespace or line separated)
stdout: the tweets as two-column TSV: ID \t TweetJSON

This retrieves tweets using the API.

If there was an error when retrieving a message -- most prominently, if the
message is now deleted -- the error information is saved as JSON in place of
the tweet. Therefore there should be exactly as many output lines as there
are input IDs.

This script tries to be robust about rate limiting, though this is not
extensively tested.

Related: Jimmy Lin's twitter-tools (formerly twitter-corpus-tools)
knows (or used to know) how to retrieve tweets based on scraping the HTML
and/or embedded JSON within the HTML from the website.
https://github.com/lintool/twitter-tools
"""
import tweepy,sys,json,time | |
# https://dev.twitter.com/rest/public/rate-limits
# seems to indicate 900 requests per 15-minute window, i.e.
# 15*60/900 = 1 second between requests is OK.
STANDARD_PAUSE_SECONDS = 1.0  # pause after each processed ID
COOLDOWN_PAUSE_SECONDS = 30.0  # pause before retrying a rate-limited ID
# Adapted from https://gist.github.com/inactivist/5263501
class MyModelParser(tweepy.parsers.ModelParser):
    """Parser that does no parsing: it hands back the RAW result.

    An instance is passed as ``parser=`` when building the API object, so
    API calls return whatever payload tweepy gives the parser instead of
    tweepy model objects.
    """

    def parse(self, method, payload):
        """Return *payload* unchanged; *method* is ignored."""
        # Alternative kept for reference: let the parent class build the
        # model, then attach the decoded JSON to it as ``result._payload``.
        return payload
# API key information. Fill in the four values below, or load them from a
# JSON file that uses the same keys, e.g.:
# d = json.load(open("/home/brenocon/.twitter_app"))
d = {
    'consumer_key': "",
    'consumer_secret': "",
    'access_token': "",
    'access_token_secret': "",
}

auth = tweepy.OAuthHandler(d['consumer_key'], d['consumer_secret'])
auth.set_access_token(d['access_token'], d['access_token_secret'])

# MyModelParser returns payloads untouched, so API results stay raw.
api = tweepy.API(auth, parser=MyModelParser())
# Twitter API error code reported when the rate limit has been hit.
RATE_LIMIT_ERROR_CODE = 88


def is_rate_limit_error(error):
    """Return True if *error* looks like a Twitter rate-limit failure.

    Three signals are checked, any of which is sufficient:
    the text 'Rate limit exceeded' in ``repr(error)``; an ``api_code``
    attribute equal to 88 (if the exception exposes one); or a list of
    error dicts as the first exception argument where one has ``code`` 88.
    """
    if 'Rate limit exceeded' in repr(error):
        return True
    if getattr(error, 'api_code', None) == RATE_LIMIT_ERROR_CODE:
        return True
    # The exception's first argument may be the decoded list of error dicts.
    # Inspect the list's elements (the exception argument itself is the
    # list, never a dict).
    detail = error.args[0] if error.args else None
    if isinstance(detail, list):
        return any(isinstance(item, dict)
                   and item.get('code') == RATE_LIMIT_ERROR_CODE
                   for item in detail)
    return False


def error_detail(error):
    """Return the first argument *error* was raised with, else its repr.

    ``error.args[0]`` works on both Python 2 and 3, unlike ``error.message``.
    """
    return error.args[0] if error.args else repr(error)


def error_record(tweet_id, detail):
    """Return the JSON text stored in the output column for a failed ID.

    The ID is stored as an int when it is numeric and as the raw token
    otherwise, so a malformed ID still yields an output line instead of
    aborting the run. A *detail* that cannot be JSON-encoded is stored as
    its repr.
    """
    try:
        stored_id = int(tweet_id)
    except (TypeError, ValueError):
        stored_id = tweet_id
    try:
        return json.dumps({'id': stored_id, 'error': detail})
    except (TypeError, ValueError):
        return json.dumps({'id': stored_id, 'error': repr(detail)})


def fetch_tweets(api, tweet_ids, api_error_type, out, err,
                 pause_seconds, cooldown_seconds, sleep=time.sleep):
    """Fetch every ID in *tweet_ids* and write one TSV line per ID to *out*.

    Args:
        api: object with a ``get_status(tweet_id)`` method.
        tweet_ids: iterable of ID tokens, processed in order.
        api_error_type: exception class raised by *api* for API failures.
        out: stream receiving ``ID \\t TweetJSON-or-error-JSON`` lines.
        err: stream receiving progress / diagnostic messages.
        pause_seconds: delay after each processed ID.
        cooldown_seconds: delay before retrying a rate-limited ID.
        sleep: callable used for delays (injectable for testing).

    A rate-limited ID is retried after the cooldown until it stops being
    rate limited; any other failure is recorded in the output and the run
    continues with the next ID.
    """
    for tweet_id in tweet_ids:
        while True:
            try:
                tweet = api.get_status(tweet_id)
                out.write("%s\t%s\n" % (tweet_id, tweet))
            except api_error_type as e:
                if is_rate_limit_error(e):
                    err.write("cooldown then will retry id %s\n" % tweet_id)
                    sleep(cooldown_seconds)
                    continue
                # unfixable error
                err.write("Error for id %s; continuing. Error information "
                          "saved to output.\n" % tweet_id)
                out.write("%s\t%s\n" % (
                    tweet_id, error_record(tweet_id, error_detail(e))))
            except Exception as e:
                err.write("Unknown exception. Continuing. "
                          "Saving info to output.\n")
                out.write("%s\t%s\n" % (
                    tweet_id, error_record(tweet_id, repr(e))))
            break
        out.flush()
        sleep(pause_seconds)


if __name__ == "__main__":
    ids_to_get = sys.stdin.read().split()
    sys.stderr.write("%s IDs to get\n" % len(ids_to_get))
    fetch_tweets(api, ids_to_get, tweepy.error.TweepError,
                 sys.stdout, sys.stderr,
                 STANDARD_PAUSE_SECONDS, COOLDOWN_PAUSE_SECONDS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment