import requests
import os
import json
from bs4 import BeautifulSoup
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import time


def get_tweets(st, et, words, ns_tokens=[], j0=0):
    # Build the tweet, user, and next-token dataframes by paging through the search endpoint.
    # bearer_token and search_url are module-level globals set in the __main__ block below.
    headers = {
        "Authorization": "Bearer " + bearer_token}
    if len(ns_tokens) > 0:
        # allow resuming from a single saved next_token string
        ns_tokens = [ns_tokens]
    for j in range(j0, 500):
        tweet_lst = []
        users_lst = []
        # places_lst=[]
        for i in range(0, 100):
            # print(j, i)
            query_params = {'query': words, 'expansions': 'author_id',
                            'tweet.fields': 'id,created_at,author_id,text,public_metrics',
                            'user.fields': 'username', 'max_results': '100', 'start_time': st, 'end_time': et}
            if (i > 0) or (j > 0):
                # print(ns_tokens[-1])
                query_params = {'query': words, 'next_token': ns_tokens[-1], 'expansions': 'author_id',
                                'tweet.fields': 'id,created_at,author_id,text,public_metrics',
                                'user.fields': 'username', 'max_results': '100', 'start_time': st, 'end_time': et}
            response = requests.request("GET", search_url, headers=headers, params=query_params)
            soup = BeautifulSoup(response.text, 'html.parser')
            s1 = str(soup)
            js = json.loads(s1)
            # print(js)
            try:
                js2 = js['meta']
            except:
                # print the raw response for debugging, then fail on the same key access
                print(i)
                print(js)
                js2 = js['meta']
            if 'next_token' in js2.keys():
                ns = js2['next_token']
                # print('next', ns)
                ns_tokens.append(ns)
            else:
                # No next_token means this is the last page: save the final batch of CSVs.
                # On the next iteration ns_tokens[-1] raises an IndexError on the emptied list,
                # which is how the script currently stops once every tweet in the range is captured.
                ns_df = pd.DataFrame(ns_tokens)
                for tweet in js['data']:
                    tweet_lst.append([tweet['created_at'], tweet['id'], tweet['author_id'], tweet['text'],
                                      tweet['public_metrics']['retweet_count'], tweet['public_metrics']['reply_count'],
                                      tweet['public_metrics']['like_count'], tweet['public_metrics']['quote_count']])
                for users in js['includes']['users']:
                    users_lst.append([users['id'], users['username']])
                users_df = pd.DataFrame(users_lst, columns=['id', 'username'])
                # places_df=pd.DataFrame(places_lst, columns=['id','full_name','name','geo','place_type'])
                tweet_df = pd.DataFrame(tweet_lst,
                                        columns=['time', 'id', 'author_id', 'text', 'retweet', 'reply', 'like',
                                                 'quote'])
                tweet_df.to_csv('tweet_df' + words.replace(':', '') + str(j * 100) + 'to' + str((j + 1) * 100 - 1)
                                + st[:12] + et[:12] + '.csv')
                users_df.to_csv('users_df' + words.replace(':', '') + str(j * 100) + 'to' + str((j + 1) * 100 - 1)
                                + st[:12] + et[:12] + '.csv')
                # places_df.to_csv('/content/gdrive/MyDrive/twitterAPI/powerout/places_df'+str(j*100)+'to'+str((j+1)*100-1)+st+et+'.csv')
                ns_df.to_csv('ns_df' + words.replace(':', '') + str(j * 100) + 'to' + str((j + 1) * 100 - 1)
                             + st[:12] + et[:12] + '.csv')
                print('wow', i, j)
                ns_tokens = []
            for tweet in js['data']:
                tweet_lst.append([tweet['created_at'], tweet['id'], tweet['author_id'], tweet['text'],
                                  tweet['public_metrics']['retweet_count'], tweet['public_metrics']['reply_count'],
                                  tweet['public_metrics']['like_count'], tweet['public_metrics']['quote_count']])
            for users in js['includes']['users']:
                users_lst.append([users['id'], users['username']])
            # for places in js['includes']['places']:
            #     places_lst.append([places['id'],places['full_name'],places['name'],places['geo'],places['place_type']])
            time.sleep(5)  # stay within the rate limit of 300 requests per 15 minutes
        # save one set of CSVs per batch of 100 requests (i.e., per value of j)
        ns_df = pd.DataFrame(ns_tokens)
        users_df = pd.DataFrame(users_lst, columns=['id', 'username'])
        # places_df=pd.DataFrame(places_lst, columns=['id','full_name','name','geo','place_type'])
        tweet_df = pd.DataFrame(tweet_lst,
                                columns=['time', 'id', 'author_id', 'text', 'retweet', 'reply', 'like', 'quote'])
        tweet_df.to_csv('tweet_df' + words.replace(':', '') + str(j * 100) + 'to' + str((j + 1) * 100 - 1)
                        + st[:12] + et[:12] + '.csv')
        users_df.to_csv('users_df' + words.replace(':', '') + str(j * 100) + 'to' + str((j + 1) * 100 - 1)
                        + st[:12] + et[:12] + '.csv')
        # places_df.to_csv('/content/gdrive/MyDrive/twitterAPI/powerout/places_df'+str(j*100)+'to'+str((j+1)*100-1)+st+et+'.csv')
        ns_df.to_csv('ns_df' + words.replace(':', '') + str(j * 100) + 'to' + str((j + 1) * 100 - 1)
                     + st[:12] + et[:12] + '.csv')
        # keep only the most recent tokens so the next batch can continue where this one left off
        ns_tokens = ns_tokens[-2:]
        print(j, len(tweet_lst))


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # set the BEARER_TOKEN environment variable to your app's bearer token before running
    bearer_token = os.environ.get("BEARER_TOKEN")
    search_url = "https://api.twitter.com/2/tweets/search/all"
    # if you have standard access, you can use the search url below instead, to search recent tweets from the past week
    # search_url = "https://api.twitter.com/2/tweets/search/recent"
    st = '2021-07-27T00:00:00Z'
    et = '2021-07-29T14:22:00Z'
    words = 'Simone Biles'
    get_tweets(st, et, words)
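As a side note, since get_tweets also accepts ns_tokens and j0, and the pagination tokens are written to the ns_df CSVs, an interrupted run can in principle be resumed from the last saved token. A minimal sketch, where the token value and batch index are placeholders you would take from your own most recent ns_df CSV:

    # resume an interrupted download: pass the last saved next_token as a string,
    # and the batch index at which to continue the CSV numbering
    last_token = 'PASTE_LAST_NEXT_TOKEN_HERE'  # placeholder: copy from the newest ns_df CSV
    get_tweets(st, et, words, ns_tokens=last_token, j0=12)  # j0=12 continues at files '1200to1299'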
Hi @vicharbour852, I ran this code with standard access, which seemed to work:
search_url = "https://api.twitter.com/2/tweets/search/recent"
st = '2021-08-05T00:00:00Z'
et = '2021-08-06T00:00:00Z'
Make sure to update the Bearer token. I tried this with standard access, and it works just fine. Also, your error looks like a Python error rather than an API error.
The way I have it set up now, once all the tweets within the range have been captured, the script prints 'wow', i, j and then exits with an error.
Did you happen to see whether CSV files containing tweet information are being saved on your computer?
Hi @skandavivek, I did see CSV files and the 'wow', i, j message, so it actually ran successfully?
Then I guess my question is about the second script that does the plot/graph analysis, which throws the error below. I have tweet files up to '1100 to 1199' on my local drive. Sorry for the basic question, and thank you very much.
FileNotFoundError: [Errno 2] No such file or directory: 'tweet_dfSimone Biles1200to12992021-07-30T12021-07-31T1.csv'
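(One way to avoid this kind of error is to load only the tweet_df files that were actually written, rather than hard-coding the batch range; a rough sketch, assuming the CSVs are in the current working directory:)

    import glob
    import pandas as pd

    # gather every tweet_df CSV the download step produced, whatever the last batch number was
    files = sorted(glob.glob('tweet_dfSimone Biles*.csv'))
    tweet_df = pd.concat((pd.read_csv(f, index_col=0) for f in files), ignore_index=True)
    print(len(files), 'files,', len(tweet_df), 'tweets')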
Hi @skandavivek, thank you for posting this nice piece on Medium. I am new to Twitter analysis and was trying to run your code. I have a basic standard account, so I changed the search_url to https://api.twitter.com/2/tweets/search/recent and changed the date range. But I run into an "IndexError: list index out of range". Would you mind pointing me to the error and a fix? Much appreciated, thank you!!