Skip to content

Instantly share code, notes, and snippets.

@emhart
Last active August 29, 2015 13:57
Show Gist options
  • Save emhart/9399250 to your computer and use it in GitHub Desktop.
Save emhart/9399250 to your computer and use it in GitHub Desktop.
Create twitter analytics for social media, an intro
library(twitteR)
library(RCurl)
library(plyr)
library(tm)
library(wordcloud)
library(ggplot2)
### Great now let's figure out a way to highlight rapid acceleration of tweets
### Let's write a function that will take a vector of posix dates, a time threshold (in seconds), and assign them indexes
### dates: a vector of POSIXct dates
### thresh: a minimum span of time to consider it a run on tweets. Default is 100 seconds
### blockSize: The minimum number of tweets in a row to consider in a run of rapid tweets, Default is 10
rapidTweets <- function(dates, thresh = 100, blockSize = 10){
index <- rep(0,length(dates))
group <- 1
indexTemp <- vector()
timeDiffs <- abs(diff(dates))
for(i in 1:(length(timeDiffs)-1)){
if(timeDiffs[i] <= thresh && timeDiffs[i+1] <= thresh){
indexTemp <- c(indexTemp,i)
}
if(timeDiffs[i] <= thresh && timeDiffs[i+1] > thresh && length(indexTemp) >= blockSize){
indexTemp <- c(indexTemp,i)
index[indexTemp] <- group
indexTemp <- vector()
group <- group + 1
}
if(timeDiffs[i] <= thresh && timeDiffs[i+1] > thresh && length(indexTemp) < blockSize){
indexTemp <- vector()
}
}
return(index)
}
#### Standard word cloud code, I've made it a function just to make life a bit easier.
### textVector: a character vector
### myStopwords: some extra stopwords you might want
### returnTDM: if True return the TDM so you can see frequent words
createWC <- function(textVector,myStopwords, returnTDM = FALSE){
### This is pretty boiler plate stuff
# Create a corpus of words
myCorpus <- Corpus(VectorSource(textVector))
# Here we'll lowercase everything, strip punctionation, and remove stop words
myCorpus <- tm_map(myCorpus, function(x) iconv(x, to='UTF-8-MAC', sub='byte'))
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
### Create stopwords list and strip them out
myStopwords <- c(stopwords('english'),myStopwords)
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
### Stem words
dictCorpus <- myCorpus
myCorpus <- tm_map(myCorpus, stemDocument)
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
### Next we create out term document matrix
mytdm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 2))
m <- as.matrix(mytdm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
pal <- colorRampPalette(c("red","blue"))(10)
wordcloud(d$word, d$freq, min.freq=3,colors=pal,random.order=FALSE)
if(returnTDM){
return(mytdm)
}
}
# Set SSL certs globally
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
#### This is all the code you want to authorize with the twitter API and then grab tweets!
reqURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
consumerKey <- "YOUR_KEY"
consumerSecret <- "YOUR_SECRET"
twitCred <- OAuthFactory$new(consumerKey=consumerKey,consumerSecret=consumerSecret,requestURL=reqURL,accessURL=accessURL,authURL=authURL)
### Cover your bases if you don't have certificates installed
twitCred$handshake()
registerTwitterOAuth(twitCred)
scioSearch <- searchTwitter(searchString = "@naturalsciences", n = 1000)
###### Get everything into dataframes
fullText <- ldply(scioSearch, function(x) return(x$text))
retweets <- ldply(scioSearch, function(x) return(x$retweetCount))
## Grab
wasRetweet <- ldply(scioSearch, function(x) !is.na(grep("RT",x$text)[1]))
### View the fraction of retweets
print(sum(was_retweet)/length(scioSearch))
### We could also see the rate of tweeting
# Extract user names
screenName <- ldply(scioSearch, function(x) return(x$screenName))
### Extract time stamps
dates <- ldply(scioSearch, function(x) return(as.POSIXct(x$created)))
### Create tweet date data frame
tweetDF <- data.frame(screenName,dates,fullText)
### Add accumulation
tweetDF$tweetN <- dim(tweetDF)[1]:1
tweetDF$diffs <- rapidTweets(dates$V1,thresh=2500)
colnames(tweetDF) <- c("screenName","date","fullText","tweetN","groups")
ggplot(tweetDF,aes(x=date,y=tweetN))+geom_point() theme_bw(20) + xlab("Date") + ylab("Cumulative tweet number")
twitStopwords <- c("http","https","t.co","co","naturalsciences","amp","today","\n","diff","one","talk","museum")
textVect <- paste(tweetDF$fullText,collapse=" ")
createWC(textVect,twitStopwords)
### Now let's subset by our data by runs of tweets, let's look at group 1
g1 <- paste(tweetDF$fullText[tweetDF$group==1],collapse=" ")
outTDM <- createWC(g1,twitStopwords,returnTDM=T)
freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE)
fwordsg1 <- paste(names(freqWords[1:4]),collapse = ", ")
### Now let's subset by our data by runs of tweets, let's look at group 2
g2 <- paste(tweetDF$fullText[tweetDF$group==2],collapse=" ")
outTDM <- createWC(g2,twitStopwords,returnTDM=T)
freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE)
fwordsg2 <-paste(names(freqWords[1:4]),collapse = ", ")
### Now let's subset by our data by runs of tweets, let's look at group 3
g3 <- paste(tweetDF$fullText[tweetDF$group==3],collapse=" ")
outTDM <- createWC(g3,twitStopwords,returnTDM=T)
freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE)
fwordsg3 <- paste(names(freqWords[1:4]),collapse = ", ")
tplot <- ggplot(tweetDF,aes(x=date,y=tweetN,colour = as.factor(groups)))+geom_point() +theme_bw(20) + scale_colour_manual("Rapid Tweet \n groups",values = c("black","red","blue","orange")) + xlab("Date") +ylab("Cumulative tweet number")
### Assemble data frame for text plots
### offset the dates for plotting, and do this out by hand because sapply doesn't return posix dates
textTweetN <- sapply(1:3, function(x) median(tweetDF$tweetN[tweetDF$group==x]))
### Play around with this to get it right
offs <- 100000
textTweetDate <- c(median(tweetDF$date[tweetDF$group==1]) + offs ,median(tweetDF$date[tweetDF$group==2]) + offs,median(tweetDF$date[tweetDF$group==3]) + offs)
textDF <- data.frame(c(fwordsg1,fwordsg2,fwordsg3),textTweetN,textTweetDate,1:3)
colnames(textDF) <- c("fText","tweetN","date","groups")
tplot + geom_text(data = textDF,aes(label = fText, x=date,y=tweetN,colour=as.factor(groups))) + guides(colour=FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment