Skip to content

Instantly share code, notes, and snippets.

@khaledadrani
Last active December 21, 2021 17:52
Show Gist options
  • Select an option

  • Save khaledadrani/4fccbe507e03db61bc83a229778d2b7d to your computer and use it in GitHub Desktop.

Select an option

Save khaledadrani/4fccbe507e03db61bc83a229778d2b7d to your computer and use it in GitHub Desktop.
Filter_bad_tweets
import re
# Matches http/https URLs. Written as a raw string so that "\(" and "\)" reach
# the regex engine unchanged instead of being treated as (invalid) string escapes.
# Note that "$-_" inside the character class is an ASCII *range*; it covers
# "/", ":", digits, upper-case letters, etc., which is what lets the pattern
# consume the path portion of a link.
_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)

# Suffixes treated as "this tweet was truncated": three ASCII dots or the
# single Unicode ellipsis character.
_TRUNCATION_MARKERS = ("...", "…")


def filter_tweets(selected):
    """Return the URL-stripped tweet texts that look complete and short.

    Every entry of ``selected.text`` has its http/https URLs removed. The
    result is then dropped when either:

    * ignoring surrounding whitespace, it ends with ``...`` or ``…``
      (the tweet is truncated and points at an external source), or
    * it is 200 characters or longer (only texts with ``len < 200`` are kept;
      the length is measured after URL removal, before any whitespace strip).

    Parameters
    ----------
    selected
        Any object exposing an iterable ``text`` attribute that yields
        strings (presumably a pandas DataFrame with a ``text`` column --
        confirm against the caller).

    Returns
    -------
    list of str
        The surviving texts, in input order, with URLs removed. Whitespace
        left behind by a removed URL is *not* trimmed.
    """
    filtered = []
    for text in selected.text:
        string = _URL_PATTERN.sub("", text)
        # Strip only for the suffix test so a trailing space left by a removed
        # URL does not hide the ellipsis.
        if string.strip().endswith(_TRUNCATION_MARKERS):
            continue
        if len(string) < 200:
            filtered.append(string)
    return filtered
filtered = filter_tweets(tweets)
len(filtered)

# get a list of tickers and company names
stocks = list(tickers.ticker) + list(tickers[' name'])

# Tokenise every search term once, on whitespace. Doing so
#   * lets multi-word company names match (a plain `s in text.split()` test can
#     only ever match single-word terms), and
#   * ignores stray leading/trailing spaces in the cell values (the ' name'
#     column label suggests the source data has a space after each delimiter
#     -- TODO confirm against the tickers file).
# Non-string cells (e.g. missing values) and blank strings can never match a
# tweet word, so they are skipped.
stock_terms = {tuple(s.split()) for s in stocks if isinstance(s, str) and s.strip()}
single_word_terms = {term[0] for term in stock_terms if len(term) == 1}
phrase_terms = [term for term in stock_terms if len(term) > 1]


def _contains_phrase(tokens, phrase):
    """Return True if the words of `phrase` occur contiguously in `tokens`."""
    width = len(phrase)
    return any(
        tuple(tokens[start:start + width]) == phrase
        for start in range(len(tokens) - width + 1)
    )


res = []
# only keep tweets that mention any ticker or company name
for text in filtered:
    tokens = text.split()  # split once per tweet, not once per (tweet, stock) pair
    mentions_stock = (
        not single_word_terms.isdisjoint(tokens)
        or any(_contains_phrase(tokens, phrase) for phrase in phrase_terms)
    )
    # Append each tweet at most once, even when it mentions several stocks.
    if mentions_stock:
        res.append(text)
len(res)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment