Skip to content

Instantly share code, notes, and snippets.

@marcosan93
Created February 25, 2022 01:37
Show Gist options
  • Save marcosan93/40bcae898002637cb610e037ea88e67d to your computer and use it in GitHub Desktop.
Save marcosan93/40bcae898002637cb610e037ea88e67d to your computer and use it in GitHub Desktop.
def getTweets(search_term, until, since, limit=20):
    """
    Configures Twint and returns a dataframe of English tweets for a date window.

    Args:
        search_term: Query string assigned to the Twint search.
        until: End date of the window as a "YYYY-MM-DD" string; the
            time " 00:00:00" is appended before it is handed to Twint.
        since: Start date of the window, same format and handling as `until`.
        limit: Value assigned to Twint's `Limit` setting.

    Returns:
        A new dataframe containing the rows whose `language` column equals
        'en', with `date` reduced to the text before the first space.
        An empty dataframe is returned when the search produced no tweets.
    """
    # Configuring Twint for search
    c = twint.Config()
    # The limit of tweets to retrieve
    c.Limit = limit
    # Search term
    c.Search = search_term
    # Removing retweets
    c.Filter_retweets = True
    # Popular tweets
    c.Popular_tweets = True
    # Verified users only
    c.Verified = True
    # Keeping the original casing of the tweets (no lowercasing)
    c.Lowercase = False
    # English only
    c.Lang = 'en'
    # Tweets until a specified date
    c.Until = until + " 00:00:00"
    # Tweets since a specified date
    c.Since = since + " 00:00:00"
    # Making the results pandas friendly
    c.Pandas = True
    # Stopping print in terminal
    c.Hide_output = True

    # Searching
    twint.run.Search(c)

    # Twint exposes the results through a module-level dataframe
    results = twint.storage.panda.Tweets_df

    # Returning an empty DF if no tweets were found. The None check is a
    # defensive guard in case twint leaves the frame unset (not verified).
    if results is None or len(results) == 0:
        return pd.DataFrame()

    # English filter to account for an issue with the twint language filter.
    # Working on a copy so twint's shared module-level frame is not modified.
    english = results[results['language'] == 'en'].copy()

    # Formatting the date: keep only the part before the first space
    english['date'] = english['date'].apply(lambda x: x.split(" ")[0])

    return english
def tweetByDay(start, end, df, search, limit=20):
    """
    Runs the twint query for every day between the given dates and returns
    the total dataframe.

    Args:
        start: The first date in the past, as a "YYYY-MM-DD" string.
        end: The last date (usually would be current date), same format.
            The day `end` itself is not queried as a window start.
        df: Dataframe of previously collected tweets to extend; it is not
            modified in place.
        search: Search term forwarded to `getTweets`.
        limit: Tweet limit forwarded to `getTweets` for every day.

    Returns:
        `df` followed by all newly collected tweets, with rows sharing the
        same `id` removed. If `start` is not before `end`, nothing is
        queried and only the de-duplication is applied to `df`.
    """
    # Function-scope import so this block does not rely on how the rest of
    # the file imports pandas.
    import pandas as pd

    date_format = "%Y-%m-%d"

    # Comparing parsed dates (not strings) so the loop always terminates,
    # even when `start` is after `end` or the strings are not zero-padded.
    current = datetime.strptime(start, date_format)
    last = datetime.strptime(end, date_format)

    collected = []
    total = len(df)

    while current < last:
        since = current.strftime(date_format)
        # Each query spans two days while the loop advances one day, so
        # consecutive windows overlap; duplicates are dropped at the end.
        until = (current + timedelta(days=2)).strftime(date_format)

        # One initial query plus up to three more in case twint missed
        # some tweets.
        for _ in range(4):
            tweet_df = getTweets(
                search,
                until=until,
                since=since,
                limit=limit
            )
            if len(tweet_df) > 0:
                break

        # Adding the new tweets
        collected.append(tweet_df)
        total += len(tweet_df)

        # Updating the new start date
        current += timedelta(days=1)

        # Printing scraping status
        print(f"\t{total} Total Tweets collected as of {current.strftime(date_format)}\t")

    # Combining everything once instead of growing the frame day by day
    if collected:
        df = pd.concat([df, *collected], ignore_index=True)

    # Removing any potential duplicates; a frame without an "id" column
    # (e.g. a completely empty one) has nothing to de-duplicate.
    if "id" in df.columns:
        df = df.drop_duplicates(subset="id")

    print(len(df))

    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment