Last active
December 21, 2021 17:52
-
-
Save khaledadrani/4fccbe507e03db61bc83a229778d2b7d to your computer and use it in GitHub Desktop.
Filter_bad_tweets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
# Matches http:// and https:// URLs. Written as a raw string so the escaped
# parentheses reach the regex engine untouched, and compiled once so the loop
# in filter_tweets reuses the same pattern object.
_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)

# Both spellings of a trailing ellipsis: three ASCII dots and the single
# U+2026 character.
_ELLIPSES = ('...', '…')


def filter_tweets(selected):
    """Return the usable tweet texts from ``selected.text`` with URLs removed.

    Each text has every http/https URL stripped out, and is then dropped if:

    * it ends with an ellipsis (``...`` or ``…``) once surrounding whitespace
      is ignored -- the original author treats this as a sign that the tweet
      is truncated and only links to an external source; or
    * its URL-stripped length is 200 characters or more.

    Entries that are not strings (for example ``None`` or NaN standing in for
    a missing value) are skipped instead of raising ``TypeError``.

    Args:
        selected: Any object whose ``text`` attribute is an iterable of tweet
            texts.

    Returns:
        A list of the URL-stripped texts that passed both checks, in their
        original order. Whitespace left behind by a removed URL is kept.
    """
    filtered = []
    for text in selected.text:
        # The regex substitution only accepts str; skip missing values.
        if not isinstance(text, str):
            continue
        string = _URL_PATTERN.sub('', text)
        # The ellipsis check ignores surrounding whitespace, but the length
        # check below deliberately measures the unstripped string.
        if string.strip().endswith(_ELLIPSES):
            continue
        if len(string) < 200:
            filtered.append(string)
    return filtered
# Clean the raw tweets (URLs removed, truncated and over-long tweets dropped).
filtered = filter_tweets(tweets)
len(filtered)

# Get a list of tickers and company names.
stocks = list(tickers.ticker) + list(tickers[' name'])

# A set gives constant-time membership tests and removes duplicate entries.
stock_lookup = set(stocks)

# Only keep tweets that mention any ticker or company name. Each tweet is
# kept at most once, even when it mentions several tickers or names.
# NOTE: matching is done on whitespace-separated tokens, so an entry in
# `stocks` that itself contains a space can never match here.
res = [
    text
    for text in filtered
    if not stock_lookup.isdisjoint(text.split())
]
len(res)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment