Skip to content

Instantly share code, notes, and snippets.

@brendano
Last active August 8, 2017 21:47
Show Gist options
  • Save brendano/fa7fae9e8a3ad40500474aa89b226039 to your computer and use it in GitHub Desktop.
Save brendano/fa7fae9e8a3ad40500474aa89b226039 to your computer and use it in GitHub Desktop.
r"""
stdin: IDs of tweets to get (whitespace or line separated)
stdout: the tweets as two-column TSV: ID \t TweetJSON
This retrieves tweets using the API.
If there was an error when retrieving a message -- most prominently, if the
message is now deleted -- the error information is saved as JSON. Therefore
there should be exactly as many output lines as there are input IDs.
This script tries to be robust about rate limiting, though this is not
extensively tested.
Related: Jimmy Lin's twitter-tools (formerly twitter-corpus-tools)
knows (or used to know) how to retrieve tweets based on scraping the HTML
and/or embedded JSON within the HTML from the website.
https://github.com/lintool/twitter-tools
"""
import tweepy,sys,json,time
# Seconds to sleep after each processed ID before requesting the next one.
# https://dev.twitter.com/rest/public/rate-limits
# seems to indicate 15*60/900 = 1 request per second is OK
# (presumably 900 requests per 15-minute window -- verify against the current limits)
STANDARD_PAUSE_SECONDS = 1.0
# Seconds to sleep after a rate-limit error before retrying the same ID.
COOLDOWN_PAUSE_SECONDS = 30.0
# Don't parse: hand back the RAW result.
# Adapted from https://gist.github.com/inactivist/5263501
class MyModelParser(tweepy.parsers.ModelParser):
    """Parser that returns the API response payload untouched."""

    def parse(self, method, payload):
        """Return ``payload`` exactly as received.

        ``method`` is accepted to match the parser interface but is not used.
        """
        # Alternative (not used): call the parent class's parse() and attach
        # json.loads(payload) to its result as ``_payload``.
        return payload
# Twitter API credentials: fill in the four values below.
d = {
    'consumer_key': "",
    'consumer_secret': "",
    'access_token': "",
    'access_token_secret': "",
}
# Alternatively, load the credentials from a JSON file with the same keys:
# d = json.load(open("/home/brenocon/.twitter_app"))

# Build the OAuth handler, attach the access token, and create the API client.
# The client uses MyModelParser, so calls return the raw response payload.
auth = tweepy.OAuthHandler(d['consumer_key'], d['consumer_secret'])
auth.set_access_token(d['access_token'], d['access_token_secret'])
api = tweepy.API(auth, parser=MyModelParser())
def _json_safe_id(raw_id):
    """Return ``raw_id`` as an int when it is numeric, otherwise the raw token.

    Input tokens come straight from stdin, so a malformed one must not make
    the error-reporting path itself raise.
    """
    try:
        return int(raw_id)
    except ValueError:
        return raw_id


def _error_detail(err):
    """Return the payload an exception was raised with.

    Reads the ``message`` attribute when the exception has one and falls
    back to ``args[0]`` otherwise.
    """
    detail = getattr(err, 'message', None)
    if detail is None and err.args:
        detail = err.args[0]
    return detail


def _is_rate_limit_error(err):
    """Return True if ``err`` looks like a rate-limit error.

    Matches either the 'Rate limit exceeded' text in the exception's repr or
    an error payload shaped like ``[{'code': 88, ...}]``.
    """
    if 'Rate limit exceeded' in repr(err):
        return True
    detail = _error_detail(err)
    if not isinstance(detail, list) or not detail:
        return False
    # Inspect the first entry of the list; indexing the exception itself
    # would just return the list again and never match.
    first = detail[0]
    return isinstance(first, dict) and first.get('code') == 88


def _retrieve_and_emit(tweet_id):
    """Fetch one tweet and write exactly one ``ID \\t JSON`` line to stdout.

    Rate-limit errors trigger a cooldown and a retry of the same ID. Any
    other failure is written to stdout as a JSON error record instead, so
    every input ID yields one output line.
    """
    while True:
        try:
            payload = api.get_status(tweet_id)
            sys.stdout.write("%s\t%s\n" % (tweet_id, payload))
            return
        except tweepy.error.TweepError as err:
            if _is_rate_limit_error(err):
                sys.stderr.write("cooldown then will retry id %s\n" % tweet_id)
                time.sleep(COOLDOWN_PAUSE_SECONDS)
                continue
            # unfixable error
            sys.stderr.write("Error for id %s; continuing. Error information saved to output.\n" % tweet_id)
            record = {'id': _json_safe_id(tweet_id), 'error': _error_detail(err)}
            sys.stdout.write("%s\t%s\n" % (tweet_id, json.dumps(record)))
            return
        except Exception as err:
            sys.stderr.write("Unknown exception. Continuing. Saving info to output.\n")
            record = {'id': _json_safe_id(tweet_id), 'error': repr(err)}
            sys.stdout.write("%s\t%s\n" % (tweet_id, json.dumps(record)))
            return


# Read every whitespace-separated token on stdin as a tweet ID.
ids_to_get = sys.stdin.read().split()
sys.stderr.write("%s IDs to get\n" % len(ids_to_get))

for tweet_id in ids_to_get:
    _retrieve_and_emit(tweet_id)
    # Flush per ID so partial results survive an interruption, then pause
    # before the next request.
    sys.stdout.flush()
    time.sleep(STANDARD_PAUSE_SECONDS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment