brendano · August 8, 2017 21:47
diff --git a/get_tweets_by_id.py b/get_tweets_by_id.py
 r"""
 stdin: IDs of tweets to get (whitespace or line separated)
 stdout: the tweets as two-column TSV:  ID \t TweetJSON

 This retrieves tweets using the API.

 If an error occurs while retrieving a message -- most prominently, if the
 message is now deleted -- the error information is saved as JSON.  Therefore
 there should be exactly as many output lines as there are input IDs.

 This script tries to be robust about rate limiting, though this is not
 extensively tested.

 Related: Jimmy Lin's twitter-tools (formerly twitter-corpus-tools)
 knows (or used to know) how to retrieve tweets based on scraping the HTML
 and/or embedded JSON within the HTML from the website.
 https://github.com/lintool/twitter-tools
 """
 import tweepy,sys,json,time

 # https://dev.twitter.com/rest/public/rate-limits
 # seems to indicate 15*60/900 = 1 request per second is OK

 # Seconds slept after each ID has been handled (success or recorded error)
 # before the next request is issued.
 STANDARD_PAUSE_SECONDS = 1.0
 # Seconds slept after a rate-limit error is detected, before the same ID is
 # retried.
 COOLDOWN_PAUSE_SECONDS = 30.0

 # don't parse. return RAW result
 # adapted from https://gist.github.com/inactivist/5263501
 class MyModelParser(tweepy.parsers.ModelParser):
     """Model parser whose ``parse`` skips model construction.

     The payload handed to ``parse`` is returned as-is, so callers receive the
     raw result.  (The gist this was adapted from instead ran the parent
     parser and attached the decoded JSON to the result as ``_payload``; that
     variant is intentionally not used here.)
     """

     def parse(self, method, payload):
         """Return *payload* unchanged; *method* is not consulted."""
         return payload

 # Twitter API credentials; fill in the four values before running.
 d = {
     'consumer_key': "",
     'consumer_secret': "",
     'access_token': "",
     'access_token_secret': "",
 }

 # api key information with the same keys as above
 # d = json.load(open("/home/brenocon/.twitter_app"))


 # Build the OAuth handler from the consumer key/secret stored in `d`, then
 # attach the access token pair from the same dict.
 auth = tweepy.OAuthHandler(d['consumer_key'], d['consumer_secret'])
 auth.set_access_token(d['access_token'], d['access_token_secret'])
 # API client configured with MyModelParser, whose parse() returns the payload
 # it is given unchanged (presumably so get_status() yields the raw response
 # text -- confirm against the installed tweepy version).
 api = tweepy.API(auth, parser=MyModelParser())


 def _is_rate_limited(err):
     """Return True when a tweepy error looks like a rate-limit rejection.

     Matches either the text 'Rate limit exceeded' in the exception's repr, or
     an entry with code 88 in the list of error dicts the exception was raised
     with.
     """
     if 'Rate limit exceeded' in repr(err):
         return True
     # The previous check tested e[0]; on Python 2 that is e.args[0], i.e. the
     # list itself rather than its first entry, so the code-88 test could never
     # succeed.  Inspect the entries of the list instead.
     detail = err.args[0] if err.args else None
     if not isinstance(detail, list):
         return False
     return any(isinstance(item, dict) and item.get('code') == 88
                for item in detail)


 def _error_row(tweet_id, error):
     """Format the output line recording that *tweet_id* could not be fetched.

     The line is the ID, a tab, and a JSON object with 'id' and 'error'
     members, terminated by a newline.  'id' is the integer value of the ID
     when the input token is numeric.
     """
     try:
         numeric_id = int(tweet_id)
     except ValueError:
         # A malformed input token must still produce its output line instead
         # of raising from inside the error handler and aborting the run.
         numeric_id = tweet_id
     return "%s\t%s\n" % (tweet_id, json.dumps({'id': numeric_id, 'error': error}))


 # Read every whitespace-separated ID from stdin, then fetch them one by one,
 # writing exactly one TSV line per ID to stdout and progress notes to stderr.
 ids_to_get = sys.stdin.read().split()
 sys.stderr.write("%s IDs to get\n" % len(ids_to_get))
 # Walk the list by index rather than re-slicing it after every ID, which
 # copied the remaining list each time.
 position = 0
 while position < len(ids_to_get):
     tweet_id = ids_to_get[position]
     try:
         tweet = api.get_status(tweet_id)
         sys.stdout.write("%s\t%s\n" % (tweet_id, tweet))
     except tweepy.error.TweepError as err:
         if _is_rate_limited(err):
             sys.stderr.write("cooldown then will retry id %s\n" % tweet_id)
             time.sleep(COOLDOWN_PAUSE_SECONDS)
             # position is not advanced, so the same ID is retried
             continue
         # unfixable error
         sys.stderr.write("Error for id %s; continuing. Error information saved to output.\n" % tweet_id)
         # args[0] is what Python 2 exposes as e.message for an exception
         # raised with a single argument.
         sys.stdout.write(_error_row(tweet_id, err.args[0] if err.args else ''))
     except Exception as err:
         sys.stderr.write("Unknown exception. Continuing. Saving info to output.\n")
         sys.stdout.write(_error_row(tweet_id, repr(err)))
     position += 1
     sys.stdout.flush()
     time.sleep(STANDARD_PAUSE_SECONDS)
	r"""
	stdin: IDs of tweets to get (whitespace or line separated)
	stdout: the tweets as two-column TSV: ID \t TweetJSON

	This retrieves tweets using the API.

	If an error occurs while retrieving a message -- most prominently, if the
	message is now deleted -- the error information is saved as JSON. Therefore
	there should be exactly as many output lines as there are input IDs.

	This script tries to be robust about rate limiting, though this is not
	extensively tested.

	Related: Jimmy Lin's twitter-tools (formerly twitter-corpus-tools)
	knows (or used to know) how to retrieve tweets based on scraping the HTML
	and/or embedded JSON within the HTML from the website.
	https://github.com/lintool/twitter-tools
	"""
	import tweepy,sys,json,time

	# https://dev.twitter.com/rest/public/rate-limits
	# seems to indicate 15*60/900 = 1 request per second is OK

	# Seconds slept after each ID has been handled (success or recorded error)
	# before the next request is issued.
	STANDARD_PAUSE_SECONDS = 1.0
	# Seconds slept after a rate-limit error is detected, before the same ID is
	# retried.
	COOLDOWN_PAUSE_SECONDS = 30.0

	# don't parse. return RAW result
	# adapted from https://gist.github.com/inactivist/5263501
	class MyModelParser(tweepy.parsers.ModelParser):
		"""Model parser whose ``parse`` skips model construction.

		The payload handed to ``parse`` is returned as-is, so callers receive
		the raw result.  (The gist this was adapted from instead ran the parent
		parser and attached the decoded JSON to the result as ``_payload``;
		that variant is intentionally not used here.)
		"""

		def parse(self, method, payload):
			"""Return *payload* unchanged; *method* is not consulted."""
			return payload

	# Twitter API credentials; fill in the four values before running.
	d = {
		'consumer_key': "",
		'consumer_secret': "",
		'access_token': "",
		'access_token_secret': "",
	}

	# api key information with the same keys as above
	# d = json.load(open("/home/brenocon/.twitter_app"))


	# Build the OAuth handler from the consumer key/secret stored in `d`, then
	# attach the access token pair from the same dict.
	auth = tweepy.OAuthHandler(d['consumer_key'], d['consumer_secret'])
	auth.set_access_token(d['access_token'], d['access_token_secret'])
	# API client configured with MyModelParser, whose parse() returns the payload
	# it is given unchanged (presumably so get_status() yields the raw response
	# text -- confirm against the installed tweepy version).
	api = tweepy.API(auth, parser=MyModelParser())


	def _is_rate_limited(err):
		"""Return True when a tweepy error looks like a rate-limit rejection.

		Matches either the text 'Rate limit exceeded' in the exception's repr,
		or an entry with code 88 in the list of error dicts the exception was
		raised with.
		"""
		if 'Rate limit exceeded' in repr(err):
			return True
		# The previous check tested e[0]; on Python 2 that is e.args[0], i.e.
		# the list itself rather than its first entry, so the code-88 test
		# could never succeed.  Inspect the entries of the list instead.
		detail = err.args[0] if err.args else None
		if not isinstance(detail, list):
			return False
		return any(isinstance(item, dict) and item.get('code') == 88
			for item in detail)


	def _error_row(tweet_id, error):
		"""Format the output line recording that *tweet_id* could not be fetched.

		The line is the ID, a tab, and a JSON object with 'id' and 'error'
		members, terminated by a newline.  'id' is the integer value of the ID
		when the input token is numeric.
		"""
		try:
			numeric_id = int(tweet_id)
		except ValueError:
			# A malformed input token must still produce its output line
			# instead of raising from inside the error handler and aborting.
			numeric_id = tweet_id
		return "%s\t%s\n" % (tweet_id, json.dumps({'id': numeric_id, 'error': error}))


	# Read every whitespace-separated ID from stdin, then fetch them one by one,
	# writing exactly one TSV line per ID to stdout and progress notes to stderr.
	ids_to_get = sys.stdin.read().split()
	sys.stderr.write("%s IDs to get\n" % len(ids_to_get))
	# Walk the list by index rather than re-slicing it after every ID, which
	# copied the remaining list each time.
	position = 0
	while position < len(ids_to_get):
		tweet_id = ids_to_get[position]
		try:
			tweet = api.get_status(tweet_id)
			sys.stdout.write("%s\t%s\n" % (tweet_id, tweet))
		except tweepy.error.TweepError as err:
			if _is_rate_limited(err):
				sys.stderr.write("cooldown then will retry id %s\n" % tweet_id)
				time.sleep(COOLDOWN_PAUSE_SECONDS)
				# position is not advanced, so the same ID is retried
				continue
			# unfixable error
			sys.stderr.write("Error for id %s; continuing. Error information saved to output.\n" % tweet_id)
			# args[0] is what Python 2 exposes as e.message for an exception
			# raised with a single argument.
			sys.stdout.write(_error_row(tweet_id, err.args[0] if err.args else ''))
		except Exception as err:
			sys.stderr.write("Unknown exception. Continuing. Saving info to output.\n")
			sys.stdout.write(_error_row(tweet_id, repr(err)))
		position += 1
		sys.stdout.flush()
		time.sleep(STANDARD_PAUSE_SECONDS)