Dump all tweets of a given Twitter handle as CSV using #Tweepy https://github.com/tweepy/tweepy
# -*- coding: utf-8 -*-

import tweepy  # https://github.com/tweepy/tweepy
import json
import csv
import sys

# Twitter API credentials
consumer_key = "your_consumer_key"
consumer_secret = "your_consumer_secret"
access_key = "your_access_key"
access_secret = "your_access_secret"

# Convert the raw API response to JSON and attach it to the Status object as a
# `json` attribute, so each tweet keeps its raw payload for the CSV dump.
# Quick monkeypatch, from https://gist.github.com/misja/1183424
@classmethod
def parse(cls, api, raw):
    status = cls.first_parse(api, raw)
    setattr(status, 'json', json.dumps(raw))
    return status

tweepy.models.Status.first_parse = tweepy.models.Status.parse
tweepy.models.Status.parse = parse
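
# Note: later Tweepy releases already expose the parsed payload as the
# Status._json dict, which would make this monkeypatch unnecessary; the
# script below sticks with the patched `json` string attribute.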

def get_all_tweets(screen_name):
    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # write the selected fields into a CSV file; any extra fields are ignored
    out_file = open('%s_tweets.csv' % screen_name, 'wb')
    writer = csv.DictWriter(out_file, fieldnames=["id", "created_at", "text"], extrasaction='ignore',
                            delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()
    for tweet in new_tweets:
        t = json.loads(tweet.json)
        if "text" in t:
            t["text"] = t["text"].encode("utf-8")
        writer.writerow(t)

    # an empty timeline has nothing to page through
    if not new_tweets:
        out_file.close()
        return

    # save the id of the oldest tweet, less one
    oldest = new_tweets[-1].id - 1
    no_of_tweets = len(new_tweets)
    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print "\t getting tweets before %s" % oldest

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        if not new_tweets:
            break

        # update the id of the oldest tweet, less one
        oldest = new_tweets[-1].id - 1
        no_of_tweets += len(new_tweets)

        # write this page of tweets into the file
        for tweet in new_tweets:
            t = json.loads(tweet.json)
            if "text" in t:
                t["text"] = t["text"].encode("utf-8")
            writer.writerow(t)

        print "\t ...%s tweets downloaded so far" % no_of_tweets

    out_file.close()

def usage():
    print "Usage:"
    print """\t python %s @twitter_handle""" % sys.argv[0]
    print "\t OR"
    print """\t python %s '["@twitter_handle1", "@twitter_handle2"]'""" % sys.argv[0]

if __name__ == '__main__':
    # pass in the username, or a JSON list of usernames, of the accounts to download
    # A list of handles can be collected with this XPath in the Chrome console:
    # $x("//span[contains(@class,'username js-action-profile-name')]/text()")
    if len(sys.argv) == 1:
        usage()
        sys.exit()
    given_value = sys.argv[1]
    try:
        handles = json.loads(given_value)
        for twitter_handle in handles:
            twitter_handle = twitter_handle.replace("@", "")
            print "Downloading tweets of %s" % twitter_handle
            try:
                get_all_tweets(twitter_handle)
            except Exception, e:
                print "Error in downloading %s: %s" % (twitter_handle, e)
                continue
    except Exception:
        # the argument is not a JSON document, so treat it as a single handle
        print "Not a JSON array, so treating the argument as a single handle."
        given_value = given_value.replace("@", "")
        print "Downloading tweets of %s" % given_value
        get_all_tweets(given_value)
    print "Done"
For me it worked like:
Is it a limitation of 3240 tweets? And how can I grab all tweets? Also, why do you subtract 1 from the id of the last tweet, as in `oldest = new_tweets[-1].id - 1`? Maybe it should just be `oldest = new_tweets[-1].id`? Ids look more like random numbers than serial numbers.
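
For reference on both questions: the cap of roughly 3,200 recent tweets is imposed by Twitter's user_timeline endpoint itself, not by this script, so older tweets cannot be fetched this way. The subtraction is deliberate: the API documents max_id as inclusive (it returns tweets with an id less than or equal to max_id), so reusing the oldest id verbatim would re-fetch that tweet on every page. And while ids are not serial, they are snowflake ids and therefore time-ordered, which is what the pagination relies on. A sketch of the difference, reusing `api` and a hypothetical `name` from the script:

# max_id is inclusive: the first call returns tweet `oldest` again,
# the second returns strictly older tweets only.
page = api.user_timeline(screen_name=name, count=200, max_id=oldest)
page = api.user_timeline(screen_name=name, count=200, max_id=oldest - 1)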