Created
October 11, 2020 18:08
-
-
Save esenthil2018/f1bc81b8c60573de1ca5640222d72a2f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys

import pandas as pd
import requests
# tweepy and IPython are used below but were never imported by the original.
import tweepy
from IPython.display import clear_output
# Twitter application credentials — replace with your own before running.
TWITTER_KEY = 'Your twitter key'
TWITTER_SECRET_KEY = 'Your secret key'

# Authenticate with app-only (OAuth 2) credentials. wait_on_rate_limit makes
# tweepy sleep through rate-limit windows instead of raising mid-scrape.
# NOTE(review): wait_on_rate_limit_notify exists only in tweepy 3.x — it was
# removed in tweepy 4; this script assumes tweepy 3.x throughout.
auth = tweepy.AppAuthHandler(TWITTER_KEY, TWITTER_SECRET_KEY)
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)
if not api:
    # Bail out early if construction failed; nothing below can work.
    print("Can't Authenticate")
    sys.exit(-1)
#@title Twitter Search API Inputs
#@markdown ### Enter Search Query:
# The #@param / #@markdown comments below are functional Google Colab form
# annotations — do not edit or remove them.
searchQuery = '#giraffe ' #@param {type:"string"}
#@markdown ### Enter Max Tweets To Scrape:
#@markdown #### The Twitter API Rate Limit (currently) is 45,000 tweets every 15 minutes.
maxTweets = 5000 #@param {type:"slider", min:0, max:45000, step:100}
Filter_Retweets = True #@param {type:"boolean"}
tweetsPerQry = 100 # this is the max the API permits
# Accumulates one row per scraped tweet; consumed by the DataFrame step below.
tweet_lst = []
if Filter_Retweets:
    searchQuery = searchQuery + ' -filter:retweets' # to exclude retweets
# If results from a specific ID onwards are reqd, set since_id to that ID.
# else default to no lower limit, go as far back as API allows
sinceId = None
# If results only below a specific ID are, set max_id to that ID.
# else default to no upper limit, start from the most recent tweet matching the search query.
# A non-positive sentinel means "no upper bound yet"; the loop below replaces
# it with the oldest tweet ID seen so far to page backwards.
max_id = -10000000000
# Page backwards through Twitter search results until maxTweets have been
# collected or the API stops returning results.
tweetCount = 0
# Last photo seen in the stream. The original left these unbound until the
# first photo appeared, which raised NameError for an initial media-less
# tweet; initialize to None so such rows simply carry no image.
vimage = None
vurl = None
print("Downloading max {0} tweets".format(maxTweets))
while tweetCount < maxTweets:
    try:
        # Four query shapes: max_id pages backwards through history,
        # since_id (when set) bounds the scrape from below.
        if max_id <= 0:
            if not sinceId:
                new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                        lang="en")
            else:
                new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                        lang="en", since_id=sinceId)
        else:
            if not sinceId:
                new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                        lang="en", max_id=str(max_id - 1))
            else:
                new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                        lang="en", max_id=str(max_id - 1),
                                        since_id=sinceId)
        if not new_tweets:
            print("No more tweets found")
            break
        for tweet in new_tweets:
            # Not every tweet object exposes these attributes; fall back to
            # the same defaults the original used.
            reply_count = getattr(tweet, 'reply_count', 0)
            retweeted = getattr(tweet, 'retweeted', "NA")
            # Recover the topic by stripping everything from the first '-'
            # (the " -filter:retweets" suffix added above).
            # NOTE(review): when Filter_Retweets is False, find() returns -1
            # and this drops the last character — preserved as-is.
            topic = searchQuery[:searchQuery.find('-')].capitalize().strip()
            # Keep only the calendar date of the tweet.
            tweetDate = tweet.created_at.date()
            # Download the last photo attached to the tweet, if any. The
            # [{}] default makes the loop a no-op for media-less tweets,
            # which then reuse the most recently fetched image.
            for media in tweet.entities.get("media", [{}]):
                if media.get("type", None) == "photo":
                    vurl = media["media_url"]
                    vimage = requests.get(media["media_url"])
            tweet_lst.append([tweetDate, topic,
                              tweet.id, tweet.user.screen_name, tweet.user.name,
                              tweet.text, tweet.favorite_count,
                              reply_count, tweet.user.location, vimage, vurl,
                              tweet.user.description, tweet.retweet_count,
                              retweeted])
        tweetCount += len(new_tweets)
        print("Downloaded {0} tweets".format(tweetCount))
        # Oldest ID in this page becomes the upper bound for the next page.
        max_id = new_tweets[-1].id
    except tweepy.TweepError as e:
        # Stop paging on any API error (auth failure, bad query, etc.).
        print("some error : " + str(e))
        break
# Summarize the scrape, load the rows into pandas, and persist them.
clear_output()
print("Downloaded {0} tweets".format(tweetCount))
# None (not the deprecated -1) is the documented "do not truncate" value
# for display.max_colwidth; -1 raises on modern pandas.
pd.set_option('display.max_colwidth', None)
# load it into a pandas dataframe
tweet_df = pd.DataFrame(tweet_lst,
                        columns=['tweet_dt', 'topic', 'id', 'screenname',
                                 'username', 'tweet', 'like_count',
                                 'reply_count', 'location', 'timage', 'turl',
                                 'description', 'retweet_count', 'retweeted'])
tweet_df.to_csv('tweets1.csv')
tweet_df.head()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment