Last active
December 19, 2015 12:38
-
-
Save bohdanszymanik/5955779 to your computer and use it in GitHub Desktop.
R Tweets Example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# download tweets using twitter search api, combine text and count +ve -ve words to get sentiment | |
# minor modifications to the code taken from | |
# http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/ | |
library(RCurl) | |
library(twitteR) | |
library(ROAuth) | |
requestURL <- "https://api.twitter.com/oauth/request_token" | |
accessURL = "http://api.twitter.com/oauth/access_token" | |
authURL = "http://api.twitter.com/oauth/authorize" | |
consumerKey = "someKey" | |
consumerSecret = "someSecret" | |
Cred <- OAuthFactory$new(consumerKey=consumerKey, | |
consumerSecret=consumerSecret, | |
requestURL=requestURL, | |
accessURL=accessURL, | |
authURL=authURL) | |
#The next command provides a URL which you will need to copy and paste into your favourite browser | |
#Assuming you are logged into Twitter you will then be provided a PIN number to type into the R command line | |
# This doesn't work with RStudio at the time of writing - use RGui and once done, save the Credentials - from that time on | |
# you can use RStudio (thank goodness) | |
Cred$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl") ) | |
# Checks that you are authorised | |
registerTwitterOAuth(Cred) | |
# from http://stackoverflow.com/questions/9916283/twitter-roauth-and-windows-register-ok-but-certificate-verify-failed | |
# save to reuse credentials in future | |
save(list="Cred", file="twitteR_credentials") | |
load("twitteR_credentials") | |
registerTwitterOAuth(Cred) | |
# notice you need to add the full path to the cacert file - at least I did... | |
va.tweets = searchTwitter('@virginatlantic', n = 1500, cainfo="C:\\Users\\Bohdan\\Documents\\R\\win-library\\3.0\\RCurl\\CurlSSL\\cacert.pem") | |
aa.tweets = searchTwitter('@AmericanAir', n = 1500, cainfo="C:\\Users\\Bohdan\\Documents\\R\\win-library\\3.0\\RCurl\\CurlSSL\\cacert.pem") | |
library(plyr) | |
va.text = laply(va.tweets, function(t) t$getText() ) | |
aa.text = laply(aa.tweets, function(t) t$getText() ) | |
# hmmm, R dosen't like funny characters so strip em out | |
va.text = gsub("[^[:alnum:]|^[:space:]]", "", va.text) | |
aa.text = gsub("[^[:alnum:]|^[:space:]]", "", aa.text) | |
hu.liu.pos.words = scan('c:\\wd\\positive-words.txt', what='character', comment.char=';') | |
hu.liu.neg.words = scan('c:\\wd\\negative-words.txt', what='character', comment.char=';') | |
# some terms I've noticed that could be substituted in: | |
# lift your game | |
# i am unable | |
# oh gee thats just fantastic | |
# site down again | |
# still waiting on response | |
# add in emoticons | |
pos.words = c(hu.liu.pos.words, 'smiley') | |
neg.words = c(hu.liu.neg.words, 'unsmiley') | |
substitute.emoticons = function(text) { | |
smiled <- gsub(':\\)', 'smiley', text) | |
return(gsub(':\\(', 'unsmiley', smiled)) | |
} | |
score.sentiment = function(sentences, pos.words, neg.words, .progress='none') | |
{ | |
require(plyr) | |
require(stringr) | |
# we got a vector of sentences. plyr will handle a list | |
# or a vector as an "l" for us | |
# we want a simple array ("a") of scores back, so we use | |
# "l" + "a" + "ply" = "laply": | |
scores = laply(sentences, function(sentence, pos.words, neg.words) { | |
# clean up sentences with R's regex-driven global substitute, gsub(): | |
sentence = gsub('[[:punct:]]', '', sentence) | |
sentence = gsub('[[:cntrl:]]', '', sentence) | |
sentence = gsub('\\d+', '', sentence) | |
# and convert to lower case: | |
sentence = tolower(sentence) | |
# split into words. str_split is in the stringr package | |
word.list = str_split(sentence, '\\s+') | |
# sometimes a list() is one level of hierarchy too much | |
words = unlist(word.list) | |
# compare our words to the dictionaries of positive & negative terms | |
pos.matches = match(words, pos.words) | |
neg.matches = match(words, neg.words) | |
# match() returns the position of the matched term or NA | |
# we just want a TRUE/FALSE: | |
pos.matches = !is.na(pos.matches) | |
neg.matches = !is.na(neg.matches) | |
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): | |
score = sum(pos.matches) - sum(neg.matches) | |
return(score) | |
}, pos.words, neg.words, .progress=.progress ) | |
scores.df = data.frame(score=scores, text=sentences) | |
return(scores.df) | |
} | |
test = c("you're awesome and I love you :)", "you're awful and I hate you. This is terrible service.", "I'm both impressed and amazed") | |
result = score.sentiment(substitute.emoticons(test), pos.words, neg.words) | |
# hmmm, I've had to clean the punctuation characters out - there were some errors from bnz tweets | |
# they didn't have to do this in the original... | |
va.scores = score.sentiment(substitute.emoticons(va.text), pos.words, neg.words, .progress='text') | |
aa.scores = score.sentiment(substitute.emoticons(aa.text), pos.words, neg.words, .progress='text') | |
library(ggplot2) | |
# add identifying variable for each bank | |
va.scores["Company"] <- "Virgin Atlantic" # one way | |
aa.scores$Company = 'American Airlines' # another | |
# combine scores and plot stacked | |
all.scores = rbind( aa.scores, va.scores) | |
ggplot(data = all.scores) + | |
geom_bar(mapping = aes(x=score, fill=Company), binwidth=1 ) + | |
facet_grid(Company~.) + | |
theme_bw() | |
# let's explore word associations using tm: http://rdatamining.wordpress.com/2011/11/09/using-text-mining-to-find-out-what-rdatamining-tweets-are-about/ | |
library(tm) | |
# build corpus | |
cp <- Corpus(VectorSource(all.scores$text)) | |
# clean | |
cp <- tm_map(cp, tolower) | |
cp <- tm_map(cp, removePunctuation) | |
cp <- tm_map(cp, removeNumbers) | |
# tm includes stopwords lists for languages but it misses some important common examples | |
customStopwords <- c(stopwords('english'), "available", "via") | |
# feel free to view stopwords for some other languages eg dutch, french, italian, german | |
cp <- tm_map(cp, removeWords, customStopwords) | |
# stem if necessary but you get some oddities eg mining stems to mine | |
# requires packages Snowball, RWeka, rJava, RWekajars | |
# example of what it can do for better or worse: stemDocument(c("mining", "miners")) | |
#cp <- tm_map(cp, stemDocument) | |
# build the term document matrix | |
td <- TermDocumentMatrix(cp, control = list(minWordLength=1)) | |
# we have a LOT of sparse terms... let's get rid of them | |
tds <- removeSparseTerms(td, 0.99) | |
# what's getting mentioned most? | |
findFreqTerms(tds, lowfreq = 10) | |
# let's uncover some associations | |
findAssocs(tds, 'awesome', 0.05) | |
library(wordcloud) | |
m <- as.matrix(tds) | |
v <- sort(rowSums(m), decreasing=TRUE) | |
cloudNames <- names(v) | |
d <- data.frame(word=cloudNames, freq=v) | |
wordcloud(d$word, d$freq, min.freq=10) | |
inspect(tds[1:10,1:10]) | |
# let's examine term adjacency and plot a graph of connectivity: http://rdatamining.wordpress.com/2012/05/17/an-example-of-social-network-analysis-with-r-using-package-igraph/ | |
# transform sparse termDocumentMatrix to an actual matrix, make boolean, then transform to adjacency matrix | |
tdsm <- as.matrix(tds) | |
tdsm[tdsm>=1] <- 1 | |
termMatrix <- tdsm %*% t(tdsm) | |
library(igraph) | |
g <- graph.adjacency(termMatrix, weighted=TRUE, mode="undirected") | |
# remove loops | |
g <- simplify(g) | |
# set labels and degrees of vertices | |
V(g)$label <- V(g)$name | |
V(g)$degree <- degree(g) | |
set.seed(3952) | |
layout1 <- layout.fruchterman.reingold(g) | |
plot(g, layout=layout1) | |
plot(g, layout=layout.kamada.kawai) | |
tkplot(g, layout=layout.kamada.kawai) | |
V(g)$label.cex <- 2.2 * V(g)$degree / max(V(g)$degree)+ .2 | |
V(g)$label.color <- rgb(0, 0, .2, .8) | |
V(g)$frame.color <- NA | |
egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4) | |
E(g)$color <- rgb(.5, .5, 0, egam) | |
E(g)$width <- egam | |
plot(g, layout=layout1) | |
tkplot(g, layout=layout1) | |
plot(termMatrix, terms = findFreqTerms(termMatrix, lowfreq = 5), corThreshold = 0.5) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment