Last active
August 15, 2016 22:03
-
-
Save om-henners/442da9129e8cab6b583a22827e60ae2c to your computer and use it in GitHub Desktop.
Download pyconau tweets with Twython. Because of Twitter's rate limiting, this will take a while to execute. Also, the files wind up pretty large (~300 MB per rate-limited block).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
from __future__ import print_function | |
from datetime import date, timedelta, datetime | |
import json | |
from time import sleep | |
from twython import Twython, TwythonRateLimitError | |
# Download recent #pyconau / @pyconau tweets via the Twitter search API and
# store them one JSON document per line in pyconau_2016_tweets.json.

APP_KEY = 'YOUR_APP_KEY'
APP_SECRET = 'YOUR_APP_SECRET'

# Exchange the application credentials for an OAuth 2 access token, then
# rebuild the client so that it authenticates with that token.
twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
ACCESS_TOKEN = twitter.obtain_access_token()
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)

# technically, the max is 7 days, but 8 just to quickly get around timezone shenanigans
since = date.today() - timedelta(days=8)
querystring = '#pyconau OR @pyconau since:{:%Y-%m-%d}'.format(since)
querystring_user = 'from:pyconau since:{:%Y-%m-%d}'.format(since)

# Text mode: json.dumps() returns str, which a binary-mode file rejects on
# Python 3 (text mode works on Python 2 as well).
with open('pyconau_2016_tweets.json', 'w') as f:
    tweet_ids = set()  # avoid adding duplicate tweets if possible
    for qs in [querystring, querystring_user]:
        max_id = None  # resume point: only tweets with id <= max_id are requested
        while True:
            some_results = False
            rate_limited = False
            try:
                cursor = twitter.cursor(twitter.search, q=qs, count=100, max_id=max_id)
                for result in cursor:
                    # Advance the resume point for every tweet seen (duplicates
                    # included). max_id is inclusive, so step one below the
                    # tweet to avoid fetching it again on the next attempt.
                    max_id = result['id'] - 1
                    if result['id'] in tweet_ids:
                        continue
                    f.write(json.dumps(result))
                    f.write('\n')
                    some_results = True
                    tweet_ids.add(result['id'])
            except TwythonRateLimitError:
                # Not a sign that the query is exhausted: wait and resume.
                rate_limited = True
            print(max_id)  # progress: the id the next attempt resumes from
            if rate_limited:
                # rate limit is 180 requests every 15 minutes. So sleep in between timeouts...
                sleep(60 * 16)  # 16 minutes to be on the safe side
                continue
            if not some_results:
                # Assume if there are no results, we've finally run out of tweets to be returned by the search query
                break
        print("Done", qs, datetime.now().isoformat())

print("Done @pyconau tweets", datetime.now().isoformat())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment