Skip to content

Instantly share code, notes, and snippets.

@marcosan93
Last active September 13, 2021 00:10
Show Gist options
  • Select an option

  • Save marcosan93/93cabd9fb7b345d911ff361d979dedf6 to your computer and use it in GitHub Desktop.

Select an option

Save marcosan93/93cabd9fb7b345d911ff361d979dedf6 to your computer and use it in GitHub Desktop.
def tweetByDay(start, end, df, search, limit=20, *, max_retries=3, pause=1):
    """
    Runs the twint query for every day between the given dates and returns
    the total dataframe.

    Querying begins at ``end`` and moves forward one day at a time until the
    date reaches ``start``; the ``start`` day itself is not queried. (The
    parameter names are kept as-is for backward compatibility.)

    Args:
        start: Date string ("%Y-%m-%d") at which collection stops.
        end: Date string ("%Y-%m-%d") of the first day to query.
        df: DataFrame of already-collected tweets that new results are
            added to.
        search: Search term forwarded to ``getTweets``.
        limit: Per-query limit forwarded to ``getTweets`` for every day.
        max_retries: How many extra times a day is re-queried when the
            query comes back empty.
        pause: Seconds to wait before each retry.

    Returns:
        The combined DataFrame with rows de-duplicated on the "id" column
        (when that column exists).

    Raises:
        ValueError: If ``end`` is later than ``start``, since stepping
            forward from ``end`` could never reach ``start``.
    """
    # Local import so the block does not depend on how the rest of the file
    # aliases pandas.
    import pandas as pd

    date_format = "%Y-%m-%d"
    # Compare parsed dates rather than raw strings so that non zero-padded
    # input (e.g. "2021-9-1") still terminates.
    stop_day = datetime.strptime(start, date_format)
    current_day = datetime.strptime(end, date_format)
    if current_day > stop_day:
        raise ValueError(
            f"end ({end}) must not be later than start ({start}); "
            "days are stepped forward from end until start is reached"
        )

    # Collect the per-day frames and concatenate once at the end instead of
    # copying the growing dataframe on every day.
    frames = [df]
    total = len(df)

    # Iterating (rather than recursing once per day) keeps long date ranges
    # from hitting Python's recursion limit.
    while current_day < stop_day:
        day = current_day.strftime(date_format)

        # Getting the new set of tweets for the day
        tweet_df = getTweets(search, day, limit)

        # Re-running the query only when it came back empty, and only a
        # bounded number of times, so a day without tweets cannot loop forever
        # and a successful result is never overwritten.
        attempts = 0
        while len(tweet_df) == 0 and attempts < max_retries:
            # Pausing for a bit before asking again
            time.sleep(pause)
            tweet_df = getTweets(search, day, limit)
            attempts += 1

        # Adding the new tweets
        frames.append(tweet_df)
        total += len(tweet_df)

        # Moving on to the next day
        current_day += timedelta(days=1)

        # Printing scraping status
        print(f"\t{total} Total Tweets collected as of {current_day.strftime(date_format)}\t")

    if len(frames) > 1:
        df = pd.concat(frames, ignore_index=True)

    # Removing any potential duplicates; skipped when there is no "id" column
    # (e.g. nothing was collected into an empty starting dataframe).
    if "id" in df.columns:
        df = df.drop_duplicates(subset="id")
    print(len(df))
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment