Created
June 22, 2018 01:21
-
-
Save JeremyEnglert/3eda4a123244c37b669472d9e8166ea6 to your computer and use it in GitHub Desktop.
Python Machine Learning Sentiment Classifier using NLTK Twitter Corpus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk.corpus import twitter_samples | |
#####
##### SENTIMENT FUNCTION
#####
def naiveBayesSentimentCalculator(review):
    """Classify one review string as 'positive' or 'negative'.

    Relies on the module-level ``extract_features`` helper and the
    module-level ``trainedNBClassifer`` built further down this file.
    """
    # Tokenise on whitespace, build the vocabulary-presence features,
    # then let the trained Naive Bayes model pick the label.
    tokens = review.split()
    return trainedNBClassifer.classify(extract_features(tokens))
#####
##### CREATE, TRAIN AND TEST CLASSIFIER
#####

# Load the labelled tweet corpora bundled with NLTK
# (requires a prior nltk.download('twitter_samples')).
positiveTweets = twitter_samples.strings('positive_tweets.json')
negativeTweets = twitter_samples.strings('negative_tweets.json')

# Index at which each per-class tweet list is split into train/test parts.
# NOTE(review): assumes each corpus holds ~5000 tweets — confirm locally.
testTrainingSplitIndex = 2500

# Tweets [0, testTrainingSplitIndex) train the classifier.
trainingNegativeTweets = negativeTweets[:testTrainingSplitIndex]
trainingPositiveTweets = positiveTweets[:testTrainingSplitIndex]

# Tweets [testTrainingSplitIndex, end) evaluate the classifier.
# Bug fix: the original sliced from testTrainingSplitIndex+1, silently
# dropping tweet number 2500 from each test set. Python slice starts are
# inclusive, so no +1 is needed to keep train and test disjoint.
testNegativeTweets = negativeTweets[testTrainingSplitIndex:]
testPositiveTweets = positiveTweets[testTrainingSplitIndex:]
# Build the vocabulary: every unique whitespace-separated token that
# appears anywhere in the training tweets.
def getVocabulary(negativeTweetList=None, positiveTweetList=None):
    """Return the list of unique words found in the training tweets.

    Args:
        negativeTweetList: iterable of tweet strings; defaults to the
            module-level ``trainingNegativeTweets``.
        positiveTweetList: iterable of tweet strings; defaults to the
            module-level ``trainingPositiveTweets``.

    Returns:
        list[str]: the deduplicated vocabulary. Order is unspecified
        (it passes through a set, exactly as the original code did).
    """
    if negativeTweetList is None:
        negativeTweetList = trainingNegativeTweets
    if positiveTweetList is None:
        positiveTweetList = trainingPositiveTweets
    # Accumulate tokens straight into a set, replacing the original
    # nested comprehensions + list(set(...)) round trip.
    vocabulary = set()
    for tweet in negativeTweetList:
        vocabulary.update(tweet.split())
    for tweet in positiveTweetList:
        vocabulary.update(tweet.split())
    return list(vocabulary)
# Materialise the vocabulary once at import time; extract_features reads it.
vocabulary = getVocabulary()
def getTrainingData(negativeTweetList=None, positiveTweetList=None):
    """Pair each training tweet's token list with its sentiment label.

    Args:
        negativeTweetList: iterable of tweet strings; defaults to the
            module-level ``trainingNegativeTweets``.
        positiveTweetList: iterable of tweet strings; defaults to the
            module-level ``trainingPositiveTweets``.

    Returns:
        list[tuple[list[str], str]]: (tokens, label) pairs, negatives
        first then positives — the shape nltk.classify.apply_features
        expects downstream.
    """
    if negativeTweetList is None:
        negativeTweetList = trainingNegativeTweets
    if positiveTweetList is None:
        positiveTweetList = trainingPositiveTweets
    # Build the (tokens, label) pairs directly, dropping the original's
    # intermediate {'review': ..., 'label': ...} dict representation.
    trainingData = [(tweet.split(), 'negative') for tweet in negativeTweetList]
    trainingData += [(tweet.split(), 'positive') for tweet in positiveTweetList]
    return trainingData
# Build the (tokens, label) training pairs once at import time.
trainingData = getTrainingData()
# Convert one tokenised review into a feature dictionary: one key per
# vocabulary word, valued True iff that word occurs in the review.
def extract_features(review, vocab=None):
    """Map a tokenised review to bag-of-words presence features.

    Args:
        review: iterable of word tokens from one review.
        vocab: iterable of vocabulary words to test for; defaults to the
            module-level ``vocabulary``, so existing call sites that pass
            only ``review`` keep working unchanged.

    Returns:
        dict[str, bool]: one entry per vocabulary word, True when the
        word appears in the review.
    """
    if vocab is None:
        vocab = vocabulary
    # Set membership keeps each per-word lookup O(1).
    review_words = set(review)
    return {word: (word in review_words) for word in vocab}
def getTrainedNaiveBayesClassifer(extract_features, trainingData):
    """Train an NLTK Naive Bayes classifier on the training pairs.

    Args:
        extract_features: callable mapping a token list to a feature dict.
        trainingData: list of (tokens, label) pairs.

    Returns:
        The trained ``nltk.NaiveBayesClassifier`` instance.
    """
    # Lazily wrap each (tokens, label) pair as a feature set, then fit.
    featureSets = nltk.classify.apply_features(extract_features, trainingData)
    return nltk.NaiveBayesClassifier.train(featureSets)
# Fit the classifier at import time; naiveBayesSentimentCalculator uses it.
trainedNBClassifer = getTrainedNaiveBayesClassifer(extract_features, trainingData)
def getTesttReviewSentiments(naiveBayesSentimentCalculator,
                             negativeTweetList=None, positiveTweetList=None):
    """Classify both test sets and encode each verdict numerically.

    NOTE(review): the doubled "tt" in the name is a typo, kept because
    the call at the bottom of this file uses this exact spelling.

    Args:
        naiveBayesSentimentCalculator: callable mapping a tweet string to
            the label 'positive' or 'negative'.
        negativeTweetList: tweets whose true label is negative; defaults
            to the module-level ``testNegativeTweets``.
        positiveTweetList: tweets whose true label is positive; defaults
            to the module-level ``testPositiveTweets``.

    Returns:
        dict with keys 'results-on-positive' and 'results-on-negative',
        each a list of +1 (classified positive) / -1 (classified negative).
    """
    if negativeTweetList is None:
        negativeTweetList = testNegativeTweets
    if positiveTweetList is None:
        positiveTweetList = testPositiveTweets
    labelToNum = {'positive': 1, 'negative': -1}
    # Classify and encode in one pass per test set.
    numericNegResults = [labelToNum[naiveBayesSentimentCalculator(tweet)]
                         for tweet in negativeTweetList]
    numericPosResults = [labelToNum[naiveBayesSentimentCalculator(tweet)]
                         for tweet in positiveTweetList]
    return {'results-on-positive': numericPosResults,
            'results-on-negative': numericNegResults}
def runDiagnostics(reviewResult):
    """Print and return accuracy statistics for one classifier run.

    Args:
        reviewResult: dict from ``getTesttReviewSentiments`` with keys
            'results-on-positive' and 'results-on-negative', each a list
            of +1/-1 predicted-sentiment codes.

    Returns:
        dict: percentage accuracies under the keys 'accuracy-on-positive',
        'accuracy-on-negative' and 'overall-accuracy'. The original
        returned None; callers that ignore the return value are unaffected.
    """
    positiveReviewsResult = reviewResult['results-on-positive']
    negativeReviewsResult = reviewResult['results-on-negative']
    # A +1 code is correct on the positive set; a -1 code is correct on
    # the negative set.
    numTruePositive = sum(x > 0 for x in positiveReviewsResult)
    numTrueNegative = sum(x < 0 for x in negativeReviewsResult)
    # Guard empty result lists so diagnostics never divide by zero.
    pctTruePositive = 100.0 * numTruePositive / len(positiveReviewsResult) if positiveReviewsResult else 0.0
    pctTrueNegative = 100.0 * numTrueNegative / len(negativeReviewsResult) if negativeReviewsResult else 0.0
    total = len(positiveReviewsResult) + len(negativeReviewsResult)
    overall = 100.0 * (numTruePositive + numTrueNegative) / total if total else 0.0
    print("Accuracy on positive tweets = " + "%.2f" % pctTruePositive + "%")
    # Bug fix: the original message misspelled "Accuracy" as "Accurance".
    print("Accuracy on negative tweets = " + "%.2f" % pctTrueNegative + "%")
    print("Overall accuracy = " + "%.2f" % overall + "%")
    return {'accuracy-on-positive': pctTruePositive,
            'accuracy-on-negative': pctTrueNegative,
            'overall-accuracy': overall}
# Entry point of the script: classify every test tweet and report accuracy.
runDiagnostics(getTesttReviewSentiments(naiveBayesSentimentCalculator))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment