
@AnasAlmasri
Last active November 9, 2020 04:37
full code
import twitter

# initialize the python-twitter API instance
twitter_api = twitter.Api(consumer_key='YOUR_CONSUMER_KEY',
                          consumer_secret='YOUR_CONSUMER_SECRET',
                          access_token_key='YOUR_ACCESS_TOKEN_KEY',
                          access_token_secret='YOUR_ACCESS_TOKEN_SECRET')

# test authentication
print(twitter_api.VerifyCredentials())
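# A hedged alternative (assumption: the four keys are exported as environment
# variables under these hypothetical names); reading credentials from the
# environment avoids hardcoding secrets in the script:
# import os
# twitter_api = twitter.Api(consumer_key=os.environ['TWITTER_CONSUMER_KEY'],
#                           consumer_secret=os.environ['TWITTER_CONSUMER_SECRET'],
#                           access_token_key=os.environ['TWITTER_ACCESS_TOKEN_KEY'],
#                           access_token_secret=os.environ['TWITTER_ACCESS_TOKEN_SECRET'])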
# ------------------------------------------------------------------------
def buildTestSet(search_keyword):
    try:
        # fetch up to 100 recent tweets matching the keyword
        tweets_fetched = twitter_api.GetSearch(search_keyword, count=100)
        print("Fetched " + str(len(tweets_fetched)) + " tweets for the term " + search_keyword)
        return [{"text": status.text, "label": None} for status in tweets_fetched]
    except Exception:
        print("Unfortunately, something went wrong..")
        return None
# ------------------------------------------------------------------------
search_term = input("Enter a search keyword: ")
testDataSet = buildTestSet(search_term)
if testDataSet:  # buildTestSet returns None when the API call fails
    print(testDataSet[0:4])
# ------------------------------------------------------------------------
def buildTrainingSet(corpusFile, tweetDataFile):
    import csv
    import time

    corpus = []
    # open in text mode (not 'rb') so csv.reader gets str rows under Python 3
    with open(corpusFile, 'r') as csvfile:
        lineReader = csv.reader(csvfile, delimiter=',', quotechar="\"")
        for row in lineReader:
            corpus.append({"tweet_id": row[2], "label": row[1], "topic": row[0]})

    # Twitter allows 180 status lookups per 15-minute window,
    # so sleep 900 s / 180 = 5 s between requests
    rate_limit = 180
    sleep_time = 900/rate_limit

    trainingDataSet = []
    for tweet in corpus:
        try:
            status = twitter_api.GetStatus(tweet["tweet_id"])
            print("Tweet fetched: " + status.text)
            tweet["text"] = status.text
            trainingDataSet.append(tweet)
            time.sleep(sleep_time)
        except Exception:
            continue

    # now we write the fetched tweets to the (initially empty) CSV file;
    # text mode with newline='' is what csv.writer expects under Python 3
    with open(tweetDataFile, 'w', newline='') as csvfile:
        linewriter = csv.writer(csvfile, delimiter=',', quotechar="\"")
        for tweet in trainingDataSet:
            try:
                linewriter.writerow([tweet["tweet_id"], tweet["text"], tweet["label"], tweet["topic"]])
            except Exception as e:
                print(e)
    return trainingDataSet
# ------------------------------------------------------------------------
corpusFile = "YOUR_FILE_PATH/corpus.csv"
tweetDataFile = "YOUR_FILE_PATH/tweetDataFile.csv"
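# Note on the corpus file (inferred from the reader loop in buildTrainingSet,
# not a documented format): each row of corpus.csv is expected to look like
#   <topic>,<label>,<tweet_id>
# where the label column holds values such as 'positive' or 'negative' and
# the third column is the numeric ID of the tweet to fetch.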
trainingData = buildTrainingSet(corpusFile, tweetDataFile)
# ------------------------------------------------------------------------
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
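# word_tokenize and stopwords.words rely on NLTK data packages; if they are
# not installed yet (an assumption about your environment), run once:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')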
class PreProcessTweets:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def processTweets(self, list_of_tweets):
        processedTweets = []
        for tweet in list_of_tweets:
            processedTweets.append((self._processTweet(tweet["text"]), tweet["label"]))
        return processedTweets

    def _processTweet(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace URLs with the token URL
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # replace @usernames with the token AT_USER
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # strip the # from #hashtag
        tweet = word_tokenize(tweet)  # split the tweet into word tokens
        return [word for word in tweet if word not in self._stopwords]

tweetProcessor = PreProcessTweets()
preprocessedTrainingSet = tweetProcessor.processTweets(trainingData)
preprocessedTestSet = tweetProcessor.processTweets(testDataSet)
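# A quick sanity check on a hypothetical tweet (the exact output depends on
# the stopword list shipped with your NLTK version):
# tweetProcessor._processTweet("@user I love this! http://t.co/x #fun")
# expected to return something like ['love', 'fun']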
# ------------------------------------------------------------------------
import nltk

def buildVocabulary(preprocessedTrainingData):
    all_words = []
    for (words, sentiment) in preprocessedTrainingData:
        all_words.extend(words)
    # frequency distribution over every word seen in the training tweets
    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()
    return word_features
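# A design note, not part of the original: wordlist.keys() keeps every
# distinct word as a feature. A common variant is to cap the vocabulary at
# the most frequent words, e.g.:
# word_features = [word for word, count in wordlist.most_common(1000)]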
# ------------------------------------------------------------------------
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    # one boolean feature per vocabulary word: is it present in this tweet?
    for word in word_features:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
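# Illustrative example (hypothetical tokens and vocabulary): for a tweet
# tokenized as ['love', 'fun'] and a vocabulary containing 'love' and
# 'great', extract_features returns
# {'contains(love)': True, 'contains(great)': False, ...}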
# ------------------------------------------------------------------------
# Now we can extract the features and train the classifier
word_features = buildVocabulary(preprocessedTrainingSet)
trainingFeatures = nltk.classify.apply_features(extract_features, preprocessedTrainingSet)
# ------------------------------------------------------------------------
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)
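# Optional: inspect the most discriminative features; this method is part of
# NLTK's NaiveBayesClassifier API.
# NBayesClassifier.show_most_informative_features(10)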
# ------------------------------------------------------------------------
NBResultLabels = [NBayesClassifier.classify(extract_features(tweet[0])) for tweet in preprocessedTestSet]
# ------------------------------------------------------------------------
# get the majority vote
if NBResultLabels.count('positive') > NBResultLabels.count('negative'):
    print("Overall Positive Sentiment")
    print("Positive Sentiment Percentage = " + str(100*NBResultLabels.count('positive')/len(NBResultLabels)) + "%")
else:
    print("Overall Negative Sentiment")
    print("Negative Sentiment Percentage = " + str(100*NBResultLabels.count('negative')/len(NBResultLabels)) + "%")
@msrana172

How do I get these files: corpusFile and tweetDataFile?
Please mention the details.

@PurplePrincess14

TypeError: 'NoneType' object is not subscriptable

This error appears when you try to run this:

search_term = input("Enter a search keyword: ")
testDataSet = buildTestSet(search_term)

print(testDataSet[0:4])

@diegoallanalves

trainingData = buildTrainingSet(corpusFile, tweetDataFile)

The code above is not working. Help, please.
