Obtaining 400k+ Tweets on 'Simone Biles' during the 2020 Tokyo Olympics
import requests
import os
import pandas as pd
import time


def get_tweets(st, et, words, ns_tokens=[], j0=0):
    # Page through the Twitter API v2 search endpoint and save tweets, users,
    # and pagination tokens to CSV in chunks of 100 pages (up to ~10,000 tweets
    # per chunk). bearer_token and search_url are set in the __main__ block below.
    headers = {"Authorization": "Bearer " + bearer_token}
    # If a single resume token (a string) was passed in, wrap it in a list so
    # that ns_tokens[-1] below returns that token on the first request.
    if len(ns_tokens) > 0:
        ns_tokens = [ns_tokens]
    for j in range(j0, 500):
        tweet_lst = []
        users_lst = []
        # places_lst = []
        for i in range(0, 100):
            # The very first request of a run carries no pagination token;
            # every subsequent request passes the most recent next_token.
            query_params = {'query': words, 'expansions': 'author_id',
                            'tweet.fields': 'id,created_at,author_id,text,public_metrics',
                            'user.fields': 'username', 'max_results': '100',
                            'start_time': st, 'end_time': et}
            if (i > 0) or (j > 0):
                query_params['next_token'] = ns_tokens[-1]
            response = requests.request("GET", search_url, headers=headers, params=query_params)
            js = response.json()
            try:
                js2 = js['meta']
            except KeyError:
                # Print the offending response for debugging, then fail on the re-access.
                print(i)
                print(js)
                js2 = js['meta']
            if 'next_token' in js2.keys():
                ns_tokens.append(js2['next_token'])
            else:
                # No next_token: this is the last page of results in the requested
                # window, so save everything collected so far.
                ns_df = pd.DataFrame(ns_tokens)
                for tweet in js['data']:
                    tweet_lst.append([tweet['created_at'], tweet['id'], tweet['author_id'], tweet['text'],
                                      tweet['public_metrics']['retweet_count'],
                                      tweet['public_metrics']['reply_count'],
                                      tweet['public_metrics']['like_count'],
                                      tweet['public_metrics']['quote_count']])
                for users in js['includes']['users']:
                    users_lst.append([users['id'], users['username']])
                users_df = pd.DataFrame(users_lst, columns=['id', 'username'])
                # places_df = pd.DataFrame(places_lst, columns=['id', 'full_name', 'name', 'geo', 'place_type'])
                tweet_df = pd.DataFrame(tweet_lst,
                                        columns=['time', 'id', 'author_id', 'text',
                                                 'retweet', 'reply', 'like', 'quote'])
                suffix = words.replace(':', '') + str(j * 100) + 'to' + str((j + 1) * 100 - 1) + st[:12] + et[:12] + '.csv'
                tweet_df.to_csv('tweet_df' + suffix)
                users_df.to_csv('users_df' + suffix)
                # places_df.to_csv('places_df' + suffix)
                ns_df.to_csv('ns_df' + suffix)
                print('wow', i, j)
                # After this final save ns_tokens is emptied, so the next request
                # fails on ns_tokens[-1]; that error is how the script currently
                # terminates once the whole window has been collected.
                ns_tokens = []
            for tweet in js['data']:
                tweet_lst.append([tweet['created_at'], tweet['id'], tweet['author_id'], tweet['text'],
                                  tweet['public_metrics']['retweet_count'],
                                  tweet['public_metrics']['reply_count'],
                                  tweet['public_metrics']['like_count'],
                                  tweet['public_metrics']['quote_count']])
            for users in js['includes']['users']:
                users_lst.append([users['id'], users['username']])
            # for places in js['includes']['places']:
            #     places_lst.append([places['id'], places['full_name'], places['name'], places['geo'], places['place_type']])
            time.sleep(5)  # stay within the rate limit of 300 requests per 15 minutes
        # Save one chunk of up to 100 pages per outer iteration.
        ns_df = pd.DataFrame(ns_tokens)
        users_df = pd.DataFrame(users_lst, columns=['id', 'username'])
        # places_df = pd.DataFrame(places_lst, columns=['id', 'full_name', 'name', 'geo', 'place_type'])
        tweet_df = pd.DataFrame(tweet_lst,
                                columns=['time', 'id', 'author_id', 'text', 'retweet', 'reply', 'like', 'quote'])
        suffix = words.replace(':', '') + str(j * 100) + 'to' + str((j + 1) * 100 - 1) + st[:12] + et[:12] + '.csv'
        tweet_df.to_csv('tweet_df' + suffix)
        users_df.to_csv('users_df' + suffix)
        # places_df.to_csv('places_df' + suffix)
        ns_df.to_csv('ns_df' + suffix)
        # Keep only the most recent tokens so the list does not grow without bound.
        ns_tokens = ns_tokens[-2:]
        print(j, len(tweet_lst))


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    bearer_token = os.environ.get("BEARER_TOKEN")
    search_url = "https://api.twitter.com/2/tweets/search/all"
    # If you have standard access, use the search URL below instead, to search
    # recent tweets from up to a week before:
    # search_url = "https://api.twitter.com/2/tweets/search/recent"
    st = '2021-07-27T00:00:00Z'
    et = '2021-07-29T14:22:00Z'
    words = 'Simone Biles'
    get_tweets(st, et, words)
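If a run stops partway through, the saved ns_df CSVs hold the pagination tokens, and get_tweets accepts a resume token plus a starting chunk index (ns_tokens and j0). A minimal sketch of resuming, not part of the original gist, assuming the ns_df files sit in the working directory, that one ns_df file was written per completed chunk, and that st, et, and words are still defined as above:

import glob
import os
import pandas as pd

# Find the most recently written ns_df file (mtime sort, since lexicographic
# order breaks once chunk numbers pass 999).
ns_files = sorted(glob.glob('ns_dfSimone Biles*.csv'), key=os.path.getmtime)
last_ns = pd.read_csv(ns_files[-1], index_col=0)
last_token = last_ns.iloc[-1, 0]   # most recent next_token string
next_chunk = len(ns_files)         # assumes one ns_df file per completed chunk j
get_tweets(st, et, words, ns_tokens=last_token, j0=next_chunk)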
Hi @skandavivek, I did see the CSV files and the 'wow, i, j' message, so it did run successfully?
Then my question is about the second piece of code that does the plot/graph analysis, where it throws the error below. I only have tweet files up to '1100to1199' on my local drive. Sorry for the basic question, and thank you very much.
FileNotFoundError: [Errno 2] No such file or directory: 'tweet_dfSimone Biles1200to12992021-07-30T12021-07-31T1.csv'
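One way to sidestep that error is to glob for whatever tweet_df chunks were actually saved instead of hard-coding how many exist. A minimal sketch (not the original plotting code), assuming the CSVs are in the working directory:

import glob
import pandas as pd

# Load every tweet_df chunk that was saved, however many there are.
files = glob.glob('tweet_dfSimone Biles*.csv')
tweets = pd.concat((pd.read_csv(f, index_col=0) for f in files), ignore_index=True)
print(len(tweets), 'tweets loaded from', len(files), 'files')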
Hi @vicharbour852, I ran this code with standard access, which seemed to work:
search_url = "https://api.twitter.com/2/tweets/search/recent"
st = '2021-08-05T00:00:00Z'
et = '2021-08-06T00:00:00Z'
Make sure to update the Bearer token. Also, your error looks like a Python error rather than an API error.
The way I have it set up now, when all the tweets within the range have been captured, the script prints 'wow', i, j, then raises an error and exits.
Did you happen to see whether CSV files containing tweet information are being saved on your computer?
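For anyone verifying their setup, a single request against the recent-search endpoint is a quick way to confirm the Bearer token works before launching the full loop. A minimal sketch, assuming BEARER_TOKEN is set in the environment (the query string is just an illustration):

import os
import requests

headers = {"Authorization": "Bearer " + os.environ["BEARER_TOKEN"]}
params = {"query": "Simone Biles", "max_results": "10"}
r = requests.get("https://api.twitter.com/2/tweets/search/recent", headers=headers, params=params)
print(r.status_code)         # 200 means the token and query are accepted
print(r.json().get("meta"))  # result_count, newest_id, next_token, etc.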