bohdanszymanik · December 19, 2015 12:38
diff --git a/RTweetsAnalysis.R b/RTweetsAnalysis.R
 # download tweets using twitter search api, combine text and count +ve -ve words to get sentiment
 # minor modifications to the code taken from
 # http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/

 library(RCurl)
 library(twitteR)
 library(ROAuth)

 # OAuth endpoints for the Twitter API. All three use https: the API is
 # SSL-only, so the plain-http access/authorize URLs would make the handshake fail.
 requestURL <- "https://api.twitter.com/oauth/request_token"
 accessURL <- "https://api.twitter.com/oauth/access_token"
 authURL <- "https://api.twitter.com/oauth/authorize"
 # placeholders - replace with the key and secret of your own Twitter application
 consumerKey <- "someKey"
 consumerSecret <- "someSecret"
 Cred <- OAuthFactory$new(consumerKey=consumerKey,
                          consumerSecret=consumerSecret,
                          requestURL=requestURL,
                          accessURL=accessURL,
                          authURL=authURL)
 # The next command provides a URL which you will need to copy and paste into your favourite browser.
 # Assuming you are logged into Twitter you will then be provided a PIN number to type into the R command line.

 # This doesn't work with RStudio at the time of writing - use RGui and once done, save the credentials - from that time on
 # you can use RStudio (thank goodness)

 # the CA bundle shipped with RCurl is passed explicitly so certificate verification succeeds
 Cred$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl") )
 # Checks that you are authorised
 registerTwitterOAuth(Cred)
 # from http://stackoverflow.com/questions/9916283/twitter-roauth-and-windows-register-ok-but-certificate-verify-failed
 # save to reuse credentials in future
 save(list="Cred", file="twitteR_credentials")

 # in later sessions only these two lines are needed: reload the saved credentials and register them
 load("twitteR_credentials")
 registerTwitterOAuth(Cred)

 # searchTwitter needs the full path to the cacert file. Ask RCurl where its
 # bundle lives rather than hard-coding one user's library directory.
 cacert <- system.file("CurlSSL", "cacert.pem", package = "RCurl")
 va.tweets <- searchTwitter('@virginatlantic', n = 1500, cainfo = cacert)
 aa.tweets <- searchTwitter('@AmericanAir', n = 1500, cainfo = cacert)

 library(plyr)
 # pull the text out of every status object
 va.text <- laply(va.tweets, function(t) t$getText() )
 aa.text <- laply(aa.tweets, function(t) t$getText() )

 # hmmm, R doesn't like funny characters so strip them out.
 # ':', '(' and ')' are deliberately kept: the emoticons :) and :( must survive
 # this step so that substitute.emoticons() can still find them later on.
 strip.pattern <- "[^[:alnum:][:space:]:()]"
 va.text <- gsub(strip.pattern, "", va.text)
 aa.text <- gsub(strip.pattern, "", aa.text)

 # Hu & Liu opinion lexicons; lines starting with ';' are comments in those files
 hu.liu.pos.words <- scan('c:\\wd\\positive-words.txt', what='character', comment.char=';')
 hu.liu.neg.words <- scan('c:\\wd\\negative-words.txt', what='character', comment.char=';')

 # some terms I've noticed that could be substituted in:
 # lift your game
 # i am unable
 # oh gee thats just fantastic
 # site down again
 # still waiting on response

 # extend both lexicons with the word tokens that stand in for emoticons
 pos.words <- append(hu.liu.pos.words, "smiley")
 neg.words <- append(hu.liu.neg.words, "unsmiley")

 substitute.emoticons <- function(text) {
   # Replace the literal emoticons ":)" and ":(" in a character vector with
   # the word tokens "smiley" and "unsmiley"; returns the modified vector.
   with.smiles <- gsub(":)", "smiley", text, fixed = TRUE)
   gsub(":(", "unsmiley", with.smiles, fixed = TRUE)
 }

 score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
 {
   # Score each sentence as (number of positive words) - (number of negative words).
   #
   # sentences : character vector, one element per sentence/tweet
   # pos.words : character vector of positive terms
   # neg.words : character vector of negative terms
   # .progress : progress bar name handed on to plyr::laply (e.g. 'none', 'text')
   #
   # Returns a data.frame with columns `score` and `text` (the input sentences).

   # require() only returns FALSE when a package is missing, which surfaces later
   # as a confusing "could not find function" error - fail early and clearly instead.
   for (pkg in c("plyr", "stringr")) {
     if (!requireNamespace(pkg, quietly = TRUE)) {
       stop("score.sentiment() needs the '", pkg, "' package", call. = FALSE)
     }
   }

   # we got a vector of sentences and want a simple array of scores back:
   # "l" + "a" + "ply" = "laply"
   scores <- plyr::laply(sentences, function(sentence, pos.words, neg.words) {

     # drop punctuation, control characters and digits, then lower-case
     sentence <- gsub('[[:punct:]]', '', sentence)
     sentence <- gsub('[[:cntrl:]]', '', sentence)
     sentence <- gsub('\\d+', '', sentence)
     sentence <- tolower(sentence)

     # split on whitespace; str_split returns a list, so flatten it
     words <- unlist(stringr::str_split(sentence, '\\s+'))

     # %in% gives TRUE/FALSE per word, which sum() counts as 1/0
     sum(words %in% pos.words) - sum(words %in% neg.words)
   }, pos.words, neg.words, .progress=.progress )

   # keep the text as character (not factor) so it can be fed to text tools later
   data.frame(score=scores, text=sentences, stringsAsFactors=FALSE)
 }

 # quick sanity check on three hand-written sentences
 test <- c("you're awesome and I love you :)", "you're awful and I hate you. This is terrible service.", "I'm both impressed and amazed")
 result <- score.sentiment(substitute.emoticons(test), pos.words, neg.words)

 # hmmm, I've had to clean the punctuation characters out - there were some errors from bnz tweets
 # they didn't have to do this in the original...
 va.scores <- score.sentiment(substitute.emoticons(va.text), pos.words, neg.words, .progress='text')
 aa.scores <- score.sentiment(substitute.emoticons(aa.text), pos.words, neg.words, .progress='text')

 library(ggplot2)
 # add identifying variable for each airline
 va.scores["Company"] <- "Virgin Atlantic" # one way
 aa.scores$Company <- 'American Airlines' # another

 # combine scores and plot, one facet per company
 all.scores <- rbind( aa.scores, va.scores)

 # score is numeric, so bin it with geom_histogram(): from ggplot2 2.0.0 on
 # geom_bar() no longer accepts binwidth
 ggplot(data = all.scores) +
  geom_histogram(mapping = aes(x=score, fill=Company), binwidth=1 ) +
  facet_grid(Company~.) +
  theme_bw()

 # let's explore word associations using tm: http://rdatamining.wordpress.com/2011/11/09/using-text-mining-to-find-out-what-rdatamining-tweets-are-about/
 library(tm)

 # build corpus (as.character() guards against the text column being a factor)
 cp <- Corpus(VectorSource(as.character(all.scores$text)))

 # clean. tolower is a plain character function, not a tm transformation, so it
 # is wrapped in content_transformer() to keep the corpus elements as tm documents
 cp <- tm_map(cp, content_transformer(tolower))
 cp <- tm_map(cp, removePunctuation)
 cp <- tm_map(cp, removeNumbers)

 # tm includes stopwords lists for languages but it misses some important common examples
 customStopwords <- c(stopwords('english'), "available", "via")

 # feel free to view stopwords for some other languages eg dutch, french, italian, german
 cp <- tm_map(cp, removeWords, customStopwords)

 # stem if necessary but you get some oddities eg mining stems to mine
 # requires packages Snowball, RWeka, rJava, RWekajars
 # example of what it can do for better or worse: stemDocument(c("mining", "miners"))
 #cp <- tm_map(cp, stemDocument)

 # build the term document matrix, keeping words of every length
 # (the control option is wordLengths; minWordLength is not recognised by current tm)
 td <- TermDocumentMatrix(cp, control = list(wordLengths=c(1, Inf)))

 # we have a LOT of sparse terms... let's get rid of them
 tds <- removeSparseTerms(td, 0.99)

 # what's getting mentioned most?
 findFreqTerms(tds, lowfreq = 10)

 # let's uncover some associations
 findAssocs(tds, 'awesome', 0.05)

 library(wordcloud)
 # total frequency per term, most frequent first
 m <- as.matrix(tds)
 v <- sort(rowSums(m), decreasing=TRUE)
 cloudNames <- names(v)
 d <- data.frame(word=cloudNames, freq=v)
 wordcloud(d$word, d$freq, min.freq=10)

 # peek at the top-left corner; clamp to the matrix size so small results don't error
 inspect(tds[seq_len(min(10, nrow(tds))), seq_len(min(10, ncol(tds)))])

 # let's examine term adjacency and plot a graph of connectivity: http://rdatamining.wordpress.com/2012/05/17/an-example-of-social-network-analysis-with-r-using-package-igraph/
 # transform sparse termDocumentMatrix to an actual matrix, make boolean, then transform to adjacency matrix
 # dense 0/1 term-by-document matrix; multiplying by its transpose gives a
 # term-by-term matrix counting the documents in which two terms co-occur
 tdsm <- as.matrix(tds)
 tdsm[tdsm>=1] <- 1
 termMatrix <- tdsm %*% t(tdsm)

 library(igraph)
 g <- graph.adjacency(termMatrix, weighted=TRUE, mode="undirected")

 # remove loops
 g <- simplify(g)
 # set labels and degrees of vertices
 V(g)$label <- V(g)$name
 V(g)$degree <- degree(g)

 # fixed seed so the force-directed layout is reproducible
 set.seed(3952)
 layout1 <- layout.fruchterman.reingold(g)
 plot(g, layout=layout1)
 plot(g, layout=layout.kamada.kawai)
 tkplot(g, layout=layout.kamada.kawai)

 # scale label size by vertex degree and edge colour/width by log edge weight
 V(g)$label.cex <- 2.2 * V(g)$degree / max(V(g)$degree)+ .2
 V(g)$label.color <- rgb(0, 0, .2, .8)
 V(g)$frame.color <- NA
 egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4)
 E(g)$color <- rgb(.5, .5, 0, egam)
 E(g)$width <- egam
 plot(g, layout=layout1)
 tkplot(g, layout=layout1)

 # correlation plot of frequent terms. This must be given the term-document
 # matrix (tds): findFreqTerms() rejects the plain adjacency matrix termMatrix.
 # NOTE(review): tm's plot method for term-document matrices needs Rgraphviz - confirm it is installed
 plot(tds, terms = findFreqTerms(tds, lowfreq = 5), corThreshold = 0.5)
	# download tweets using twitter search api, combine text and count +ve -ve words to get sentiment
	# minor modifications to the code taken from
	# http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/

	library(RCurl)
	library(twitteR)
	library(ROAuth)

	# OAuth endpoints for the Twitter API. All three use https: the API is
	# SSL-only, so the plain-http access/authorize URLs would make the handshake fail.
	requestURL <- "https://api.twitter.com/oauth/request_token"
	accessURL <- "https://api.twitter.com/oauth/access_token"
	authURL <- "https://api.twitter.com/oauth/authorize"
	# placeholders - replace with the key and secret of your own Twitter application
	consumerKey <- "someKey"
	consumerSecret <- "someSecret"
	Cred <- OAuthFactory$new(consumerKey=consumerKey,
	                         consumerSecret=consumerSecret,
	                         requestURL=requestURL,
	                         accessURL=accessURL,
	                         authURL=authURL)
	# The next command provides a URL which you will need to copy and paste into your favourite browser.
	# Assuming you are logged into Twitter you will then be provided a PIN number to type into the R command line.

	# This doesn't work with RStudio at the time of writing - use RGui and once done, save the credentials - from that time on
	# you can use RStudio (thank goodness)

	# the CA bundle shipped with RCurl is passed explicitly so certificate verification succeeds
	Cred$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl") )
	# Checks that you are authorised
	registerTwitterOAuth(Cred)
	# from http://stackoverflow.com/questions/9916283/twitter-roauth-and-windows-register-ok-but-certificate-verify-failed
	# save to reuse credentials in future
	save(list="Cred", file="twitteR_credentials")

	# in later sessions only these two lines are needed: reload the saved credentials and register them
	load("twitteR_credentials")
	registerTwitterOAuth(Cred)

	# searchTwitter needs the full path to the cacert file. Ask RCurl where its
	# bundle lives rather than hard-coding one user's library directory.
	cacert <- system.file("CurlSSL", "cacert.pem", package = "RCurl")
	va.tweets <- searchTwitter('@virginatlantic', n = 1500, cainfo = cacert)
	aa.tweets <- searchTwitter('@AmericanAir', n = 1500, cainfo = cacert)

	library(plyr)
	# pull the text out of every status object
	va.text <- laply(va.tweets, function(t) t$getText() )
	aa.text <- laply(aa.tweets, function(t) t$getText() )

	# hmmm, R doesn't like funny characters so strip them out.
	# The previous pattern contained "\|", which is not a valid escape in an R
	# string and stops the script from parsing. ':', '(' and ')' are deliberately
	# kept: the emoticons :) and :( must survive this step so that
	# substitute.emoticons() can still find them later on.
	strip.pattern <- "[^[:alnum:][:space:]:()]"
	va.text <- gsub(strip.pattern, "", va.text)
	aa.text <- gsub(strip.pattern, "", aa.text)

	# Hu & Liu opinion lexicons; lines starting with ';' are comments in those files
	hu.liu.pos.words <- scan('c:\\wd\\positive-words.txt', what='character', comment.char=';')
	hu.liu.neg.words <- scan('c:\\wd\\negative-words.txt', what='character', comment.char=';')

	# some terms I've noticed that could be substituted in:
	# lift your game
	# i am unable
	# oh gee thats just fantastic
	# site down again
	# still waiting on response

	# extend both lexicons with the word tokens that stand in for emoticons
	pos.words <- append(hu.liu.pos.words, "smiley")
	neg.words <- append(hu.liu.neg.words, "unsmiley")

	substitute.emoticons <- function(text) {
	  # Replace the literal emoticons ":)" and ":(" in a character vector with
	  # the word tokens "smiley" and "unsmiley"; returns the modified vector.
	  with.smiles <- gsub(":)", "smiley", text, fixed = TRUE)
	  gsub(":(", "unsmiley", with.smiles, fixed = TRUE)
	}

	score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
	{
	  # Score each sentence as (number of positive words) - (number of negative words).
	  #
	  # sentences : character vector, one element per sentence/tweet
	  # pos.words : character vector of positive terms
	  # neg.words : character vector of negative terms
	  # .progress : progress bar name handed on to plyr::laply (e.g. 'none', 'text')
	  #
	  # Returns a data.frame with columns `score` and `text` (the input sentences).

	  # require() only returns FALSE when a package is missing, which surfaces later
	  # as a confusing "could not find function" error - fail early and clearly instead.
	  for (pkg in c("plyr", "stringr")) {
	    if (!requireNamespace(pkg, quietly = TRUE)) {
	      stop("score.sentiment() needs the '", pkg, "' package", call. = FALSE)
	    }
	  }

	  # we got a vector of sentences and want a simple array of scores back:
	  # "l" + "a" + "ply" = "laply"
	  scores <- plyr::laply(sentences, function(sentence, pos.words, neg.words) {

	    # drop punctuation, control characters and digits, then lower-case
	    sentence <- gsub('[[:punct:]]', '', sentence)
	    sentence <- gsub('[[:cntrl:]]', '', sentence)
	    sentence <- gsub('\\d+', '', sentence)
	    sentence <- tolower(sentence)

	    # split on whitespace; str_split returns a list, so flatten it
	    words <- unlist(stringr::str_split(sentence, '\\s+'))

	    # %in% gives TRUE/FALSE per word, which sum() counts as 1/0
	    sum(words %in% pos.words) - sum(words %in% neg.words)
	  }, pos.words, neg.words, .progress=.progress )

	  # keep the text as character (not factor) so it can be fed to text tools later
	  data.frame(score=scores, text=sentences, stringsAsFactors=FALSE)
	}

	# quick sanity check on three hand-written sentences
	test <- c("you're awesome and I love you :)", "you're awful and I hate you. This is terrible service.", "I'm both impressed and amazed")
	result <- score.sentiment(substitute.emoticons(test), pos.words, neg.words)

	# hmmm, I've had to clean the punctuation characters out - there were some errors from bnz tweets
	# they didn't have to do this in the original...
	va.scores <- score.sentiment(substitute.emoticons(va.text), pos.words, neg.words, .progress='text')
	aa.scores <- score.sentiment(substitute.emoticons(aa.text), pos.words, neg.words, .progress='text')

	library(ggplot2)
	# add identifying variable for each airline
	va.scores["Company"] <- "Virgin Atlantic" # one way
	aa.scores$Company <- 'American Airlines' # another

	# combine scores and plot, one facet per company
	all.scores <- rbind( aa.scores, va.scores)

	# score is numeric, so bin it with geom_histogram(): from ggplot2 2.0.0 on
	# geom_bar() no longer accepts binwidth
	ggplot(data = all.scores) +
	  geom_histogram(mapping = aes(x=score, fill=Company), binwidth=1 ) +
	  facet_grid(Company~.) +
	  theme_bw()

	# let's explore word associations using tm: http://rdatamining.wordpress.com/2011/11/09/using-text-mining-to-find-out-what-rdatamining-tweets-are-about/
	library(tm)

	# build corpus (as.character() guards against the text column being a factor)
	cp <- Corpus(VectorSource(as.character(all.scores$text)))

	# clean. tolower is a plain character function, not a tm transformation, so it
	# is wrapped in content_transformer() to keep the corpus elements as tm documents
	cp <- tm_map(cp, content_transformer(tolower))
	cp <- tm_map(cp, removePunctuation)
	cp <- tm_map(cp, removeNumbers)

	# tm includes stopwords lists for languages but it misses some important common examples
	customStopwords <- c(stopwords('english'), "available", "via")

	# feel free to view stopwords for some other languages eg dutch, french, italian, german
	cp <- tm_map(cp, removeWords, customStopwords)

	# stem if necessary but you get some oddities eg mining stems to mine
	# requires packages Snowball, RWeka, rJava, RWekajars
	# example of what it can do for better or worse: stemDocument(c("mining", "miners"))
	#cp <- tm_map(cp, stemDocument)

	# build the term document matrix, keeping words of every length
	# (the control option is wordLengths; minWordLength is not recognised by current tm)
	td <- TermDocumentMatrix(cp, control = list(wordLengths=c(1, Inf)))

	# we have a LOT of sparse terms... let's get rid of them
	tds <- removeSparseTerms(td, 0.99)

	# what's getting mentioned most?
	findFreqTerms(tds, lowfreq = 10)

	# let's uncover some associations
	findAssocs(tds, 'awesome', 0.05)

	library(wordcloud)
	# total frequency per term, most frequent first
	m <- as.matrix(tds)
	v <- sort(rowSums(m), decreasing=TRUE)
	cloudNames <- names(v)
	d <- data.frame(word=cloudNames, freq=v)
	wordcloud(d$word, d$freq, min.freq=10)

	# peek at the top-left corner; clamp to the matrix size so small results don't error
	inspect(tds[seq_len(min(10, nrow(tds))), seq_len(min(10, ncol(tds)))])

	# let's examine term adjacency and plot a graph of connectivity: http://rdatamining.wordpress.com/2012/05/17/an-example-of-social-network-analysis-with-r-using-package-igraph/
	# transform sparse termDocumentMatrix to an actual matrix, make boolean, then transform to adjacency matrix
	# dense 0/1 term-by-document matrix; multiplying by its transpose gives a
	# term-by-term matrix counting the documents in which two terms co-occur
	tdsm <- as.matrix(tds)
	tdsm[tdsm>=1] <- 1
	termMatrix <- tdsm %*% t(tdsm)

	library(igraph)
	g <- graph.adjacency(termMatrix, weighted=TRUE, mode="undirected")

	# remove loops
	g <- simplify(g)
	# set labels and degrees of vertices
	V(g)$label <- V(g)$name
	V(g)$degree <- degree(g)

	# fixed seed so the force-directed layout is reproducible
	set.seed(3952)
	layout1 <- layout.fruchterman.reingold(g)
	plot(g, layout=layout1)
	plot(g, layout=layout.kamada.kawai)
	tkplot(g, layout=layout.kamada.kawai)

	# scale label size by vertex degree and edge colour/width by log edge weight
	V(g)$label.cex <- 2.2 * V(g)$degree / max(V(g)$degree)+ .2
	V(g)$label.color <- rgb(0, 0, .2, .8)
	V(g)$frame.color <- NA
	egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4)
	E(g)$color <- rgb(.5, .5, 0, egam)
	E(g)$width <- egam
	plot(g, layout=layout1)
	tkplot(g, layout=layout1)

	# correlation plot of frequent terms. This must be given the term-document
	# matrix (tds): findFreqTerms() rejects the plain adjacency matrix termMatrix.
	# NOTE(review): tm's plot method for term-document matrices needs Rgraphviz - confirm it is installed
	plot(tds, terms = findFreqTerms(tds, lowfreq = 5), corThreshold = 0.5)