khaledadrani · December 21, 2021 17:52
diff --git a/filter_tweets_quality.py b/filter_tweets_quality.py
 import re

 def filter_tweets(selected):
     """Return the tweet texts worth keeping, with URLs removed.

     Every entry of ``selected.text`` has its http(s) URLs stripped out.
     The result is dropped when, ignoring surrounding whitespace, it ends
     with three dots or a single ellipsis character (the tweet is truncated
     and links to an external source), or when it is 200 characters or
     longer.

     Args:
         selected: Object exposing a ``text`` attribute that iterates over
             tweet strings (presumably a pandas DataFrame -- confirm
             against the caller).

     Returns:
         A list of the surviving URL-free tweet strings, in input order.
     """
     # Raw string: the previous non-raw literal relied on invalid string
     # escapes for the parentheses, which warn on Python 3.12+.
     # Compiled once here instead of rebuilding the pattern per tweet.
     url_regex = re.compile(
         r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
     )
     truncation_marks = ('...', '…')

     filtered = []
     for text in selected.text:
         string = url_regex.sub('', text)
         if string.strip().endswith(truncation_marks):
             continue
         # Length is measured on the unstripped text, and that unstripped
         # text is what gets stored.
         if len(string) < 200:
             filtered.append(string)
     return filtered


 # Strip URLs and drop truncated / overlong tweets.
 filtered = filter_tweets(tweets)
 len(filtered)

 # Get a list of tickers and company names; the set gives constant-time
 # lookups while `stocks` stays available as a list.
 stocks = list(tickers.ticker) + list(tickers[' name'])
 stock_set = set(stocks)

 # Only keep tweets that mention any ticker or company name. Each matching
 # tweet is kept exactly once; the earlier nested loop appended a tweet once
 # per matching entry, so tweets naming several stocks were duplicated.
 # NOTE(review): matching is per whitespace-separated token, so an entry that
 # itself contains whitespace (e.g. a multi-word company name) never matches.
 res = [text for text in filtered if not stock_set.isdisjoint(text.split())]

 len(res)
	import re

	def filter_tweets(selected):
	    """Return the tweet texts worth keeping, with URLs removed.

	    Every entry of ``selected.text`` has its http(s) URLs stripped out.
	    The result is dropped when, ignoring surrounding whitespace, it ends
	    with three dots or a single ellipsis character (the tweet is truncated
	    and links to an external source), or when it is 200 characters or
	    longer.

	    Args:
	        selected: Object exposing a ``text`` attribute that iterates over
	            tweet strings (presumably a pandas DataFrame -- confirm
	            against the caller).

	    Returns:
	        A list of the surviving URL-free tweet strings, in input order.
	    """
	    # The alternation bars must be plain "|": with a backslash in front
	    # they match a literal pipe character and no real URL is recognised.
	    # Raw string so the escaped parentheses reach the regex engine as-is.
	    url_regex = re.compile(
	        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
	    )
	    truncation_marks = ('...', '…')

	    filtered = []
	    for text in selected.text:
	        string = url_regex.sub('', text)
	        if string.strip().endswith(truncation_marks):
	            continue
	        # Length is measured on the unstripped text, and that unstripped
	        # text is what gets stored.
	        if len(string) < 200:
	            filtered.append(string)
	    return filtered


	# Strip URLs and drop truncated / overlong tweets.
	filtered = filter_tweets(tweets)
	len(filtered)

	# Get a list of tickers and company names; the set gives constant-time
	# lookups while `stocks` stays available as a list.
	stocks = list(tickers.ticker) + list(tickers[' name'])
	stock_set = set(stocks)

	# Only keep tweets that mention any ticker or company name. Each matching
	# tweet is kept exactly once; a nested loop over `stocks` would append a
	# tweet once per matching entry and duplicate tweets naming several stocks.
	# NOTE(review): matching is per whitespace-separated token, so an entry that
	# itself contains whitespace (e.g. a multi-word company name) never matches.
	res = [text for text in filtered if not stock_set.isdisjoint(text.split())]

	len(res)
No results found