Benchmarking sentiment-scoring algorithms for Twitter using precision, recall and F-measure.
# Short scripts for testing three different sentiment classifiers on tweets:
# acquiring the tweets used for testing, then calculating each system's
# precision, recall and F-measure.
require(RCurl)      # For downloading a file from a given URL.
require(twitteR)    # Used for the 'twitter' class.
require(sentiment)  # For the bayes and voter classifiers.
source("sent140.R") # Used for the Sentiment140 API. Can be downloaded from here:
# https://github.com/okugami79/sentiment140/blob/master/R/sentiment.r
load("twit_cred.Rdat")
registerTwitterOAuth(twit.cred)
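# The load() above assumes twit_cred.Rdat already holds an authorised OAuth
# handle. A minimal sketch of how such a handle could be created with the
# ROAuth package (the consumer key/secret are placeholders, and this step is
# not part of the original script):
#
#   library(ROAuth)
#   twit.cred = OAuthFactory$new(
#     consumerKey    = "YOUR_CONSUMER_KEY",
#     consumerSecret = "YOUR_CONSUMER_SECRET",
#     requestURL     = "https://api.twitter.com/oauth/request_token",
#     accessURL      = "https://api.twitter.com/oauth/access_token",
#     authURL        = "https://api.twitter.com/oauth/authorize"
#   )
#   twit.cred$handshake()  # Runs the PIN-based authorisation flow.
#   save(twit.cred, file = "twit_cred.Rdat")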
GetSandersCorpus = function() {
  # Download the Sanders corpus, extract the labelled tweet ids and save
  # them to disk.
  download.file(
    url = "http://www.sananalytics.com/lab/twitter-sentiment/sanders-twitter-0.2.zip",
    destfile = "sanders_twitter-0.2.zip"
  )
  file.name = "sanders-twitter-0.2/corpus.csv"
  unzip(
    zipfile = "sanders_twitter-0.2.zip",
    files = c(file.name)
  )
  san.dat = read.csv(
    file = file.name,
    stringsAsFactors = FALSE,
    header = FALSE
  )
  colnames(san.dat) = c("term", "clas", "id")
  san.dat$id = as.character(san.dat$id)
  dir.create("data", showWarnings = FALSE)  # Make sure the target directory exists.
  save(san.dat, file = "data/san_dat.Rdat")
}
# Fetch the corpus and load the saved data frame before filtering it.
GetSandersCorpus()
load("data/san_dat.Rdat")

not.found.message = "Not Found"  # Substring of the error raised for tweets that are no longer available.

# Purge the "irrelevant" tweets.
san.dat = san.dat[san.dat$clas != "irrelevant", ]
FillSanTweets = function() {
  # Pulls tweets through the API based on their Twitter ids, skipping ids we
  # have already fetched.
  sapply(setdiff(san.dat$id, names(san.tweets)), function(tid) {
    tryCatch({
      san.tweets[[tid]] <<- showStatus(tid)
      print(paste("Successfully added tweet no.", tid))
    },
    error = function(e) {
      # Match on the API's "Not Found" message to identify deleted tweets.
      if (grepl(not.found.message, conditionMessage(e), fixed = TRUE)) {
        print(paste("Tweet no.", tid, "no longer available"))
        san.tweets[[tid]] <<- NA
      } else {
        print(paste("FAILED to fetch tweet no.", tid))
      }
    }
    )
    Sys.sleep(15)  # Delay the next request so we stay within Twitter's rate
                   # limit (180 statuses/show calls per 15-minute window).
  })
}
# Create a new tweet list (empty).
san.tweets = list()
# Start populating the list by calling FillSanTweets.
FillSanTweets()
#source("twitter.R") | |
#dat = read.csv("data/full-corpus.csv", stringsAsFactors=FALSE) | |
colnames(dat) = c("term", "sent", "tid", "date", "txt") | |
dat$tid = as.character(dat$tid) | |
# "Purge" the irrelevant entries | |
dat = dat[dat$sent!="irrelevant",] | |
dat$txt = ScrubTweets(dat$txt) | |
dat = cbind(stringsAsFactors=FALSE, | |
dat, | |
classify_polarity(dat$txt, algorithm="voter"), | |
classify_polarity(dat$txt, algorithm="bayes"), | |
sentiment(dat$txt)[ , 2] | |
) | |
colnames(dat)[6:9] = c("v.pos", "v.neg", "v.ratio", "v.best") | |
colnames(dat)[10:13] = c("b.pos", "b.neg", "b.ratio", "b.best") | |
colnames(dat)[14] = "sent140" | |
dat$v.ratio = as.numeric(dat$v.ratio) | |
dat$b.ratio = as.numeric(dat$b.ratio) | |
Distribution = function() {
  # Show the percentage distribution across sentiment categories, for the
  # manual labels and for each classifier.
  sapply(list("manual" = dat$sent, "voter" = dat$v.best, "bayes" = dat$b.best, "sent140" = dat$sent140),
         function(x) {
           round(table(x) / nrow(dat) * 100, digits = 2)
         }
  )
}
ConfusionMatrices = function() {
  # Calculate the confusion matrices (manual labels as rows, predictions as
  # columns) for the three algorithms we are benchmarking. Note: list(), not
  # c(), so the tables are not flattened into a single vector.
  list(
    "voter"   = table(dat$sent, dat$v.best),
    "bayes"   = table(dat$sent, dat$b.best),
    "sent140" = table(dat$sent, dat$sent140)
  )
}
PrecisionRate = function() {
  # Calculate the precision of the three classifiers with respect to the
  # three categories: of the tweets predicted as `clas`, the share whose
  # manual label is also `clas`.
  sapply(c("negative", "neutral", "positive"), function(clas) {
    c(
      "voter"   = mean(dat[dat$v.best == clas, ]$sent == clas),
      "bayes"   = mean(dat[dat$b.best == clas, ]$sent == clas),
      "sent140" = mean(dat[dat$sent140 == clas, ]$sent == clas)
    )
  })
}
RecallRate = function() {
  # Calculate the recall of the three classifiers with respect to the three
  # categories: of the tweets manually labelled `bin`, the share the
  # classifier also assigns to `bin`.
  sapply(c("negative", "neutral", "positive"), function(bin) {
    c(
      "voter"   = mean(dat[dat$sent == bin, ]$v.best == bin),
      "bayes"   = mean(dat[dat$sent == bin, ]$b.best == bin),
      "sent140" = mean(dat[dat$sent == bin, ]$sent140 == bin)
    )
  })
}
FMeasures = function() {
  # Calculate the F-measure for every classifier / class combination.
  p = PrecisionRate()
  r = RecallRate()
  # Return the harmonic mean of precision and recall.
  2 * p * r / (p + r)
}
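
# Once `dat` has been built above, the benchmark can be inspected as follows
# (a usage sketch; all functions are defined in this file):
Distribution()       # Percentage of tweets per class, per classifier.
ConfusionMatrices()  # Manual labels (rows) vs. predicted labels (columns).
PrecisionRate()      # Per-class precision for each classifier.
RecallRate()         # Per-class recall for each classifier.
FMeasures()          # Harmonic mean of precision and recall.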