Skip to content

Instantly share code, notes, and snippets.

@mynameisfiber
Created September 28, 2013 20:09
Show Gist options
  • Select an option

  • Save mynameisfiber/6746047 to your computer and use it in GitHub Desktop.

Select an option

Save mynameisfiber/6746047 to your computer and use it in GitHub Desktop.
simple twitter archiver
#!/usr/bin/env python2.7
"""
Gets a user's timeline and saves it into flat JSON files, one file per hour. A
good way of running this is to set up a cronjob that runs every 5 minutes,
e.g.:
( twitter_archive.py >> timeline.log ) || (echo "twitter_archive failed" | mail -s "twitter archive" me@example.com)
"""
import TwitterAPI
import cPickle
import time
import os
import ujson as json
from conf import credentials
# Twitter client used by archive_endpoint(); constructed with the keyword
# arguments held in conf.credentials.
api = TwitterAPI.TwitterAPI(**credentials)
# Root directory for the archived JSON files and the pickled state files: a
# "data" folder located next to this script.
FILE_BASE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def archive_endpoint(endpoint, name):
    """
    Fetch new items from a Twitter endpoint and append them to an archive file.

    Items are written one JSON document per line to
    ``<FILE_BASE>/<year>/<month>/<day>/<name>-<hour>.json`` (local time of the
    call).  The highest item id seen is pickled to
    ``<FILE_BASE>/<name>_state.pkl`` and sent as ``since_id`` on the next call
    so that only newer items are requested.

    endpoint -- endpoint path handed to ``api.request``, e.g.
                "statuses/home_timeline"
    name     -- label used for the state file, the archive file name and the
                progress output

    Returns None.  If the state file cannot be opened or read (IOError) the
    request is made without ``since_id``.  Any other error -- from the API
    client, from creating the output directory, or from writing the files --
    propagates to the caller.
    """
    state_path = os.path.join(FILE_BASE, "%s_state.pkl" % name)

    # Load the id of the newest item archived by the previous run, if any.
    try:
        with open(state_path) as state_file:
            last_tid = cPickle.load(state_file)
    except IOError:
        last_tid = None

    # The doubled %% survive the "% name" substitution and are then expanded
    # by strftime.
    # NOTE(review): %e is a platform strftime extension (space-padded day of
    # month on glibc), not a directive listed in the Python docs; it is kept
    # so the existing on-disk directory layout does not change.
    filename = os.path.join(FILE_BASE, time.strftime("%%Y/%%m/%%e/%s-%%H.json" % name))
    dirname = os.path.dirname(filename)
    try:
        os.makedirs(dirname)
    except OSError:
        # The directory already existing is the normal case on repeat runs;
        # anything else (permissions, a file in the way, ...) is a real error.
        if not os.path.isdir(dirname):
            raise

    # Single-argument print(...) behaves the same as the Python 2 print
    # statement used by the rest of this script.
    print("Getting %s" % name)
    parameters = {
        "count": 200,
        "include_entities": True,
    }
    if last_tid is not None:
        parameters["since_id"] = last_tid
    data = api.request(endpoint, parameters)

    print("Saving to %s" % filename)
    max_tid = None
    num_items = 0
    # The with-block guarantees the archive is flushed and closed even if the
    # iterator or the serializer raises part-way through.
    with open(filename, "a") as fd:
        for item in data.get_iterator():
            tid = item["id"]
            # Explicit None check instead of relying on None sorting below
            # integers in max().
            if max_tid is None or tid > max_tid:
                max_tid = tid
            fd.write(json.dumps(item) + "\n")
            num_items += 1
    print("Saved %d items (max_tid = %r, last_tid = %r)" % (num_items, max_tid, last_tid))

    # Only advance the stored id when this run actually saw items; otherwise
    # the previous value is written back unchanged.
    if max_tid is not None:
        last_tid = max_tid
    with open(state_path, "w") as state_file:
        cPickle.dump(last_tid, state_file)
# Script entry point: archive the home timeline under the label "timeline".
if __name__ == "__main__":
    archive_endpoint(endpoint="statuses/home_timeline", name="timeline")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment