Last active
August 29, 2015 13:57
-
-
Save emhart/9399250 to your computer and use it in GitHub Desktop.
Create twitter analytics for social media, an intro
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(twitteR) | |
library(RCurl) | |
library(plyr) | |
library(tm) | |
library(wordcloud) | |
library(ggplot2) | |
### Great now let's figure out a way to highlight rapid acceleration of tweets | |
### Let's write a function that will take a vector of posix dates, a time threshold (in seconds), and assign them indexes | |
### dates: a vector of POSIXct dates | |
### thresh: a minimum span of time to consider it a run on tweets. Default is 100 seconds | |
### blockSize: The minimum number of tweets in a row to consider in a run of rapid tweets, Default is 10 | |
rapidTweets <- function(dates, thresh = 100, blockSize = 10){ | |
index <- rep(0,length(dates)) | |
group <- 1 | |
indexTemp <- vector() | |
timeDiffs <- abs(diff(dates)) | |
for(i in 1:(length(timeDiffs)-1)){ | |
if(timeDiffs[i] <= thresh && timeDiffs[i+1] <= thresh){ | |
indexTemp <- c(indexTemp,i) | |
} | |
if(timeDiffs[i] <= thresh && timeDiffs[i+1] > thresh && length(indexTemp) >= blockSize){ | |
indexTemp <- c(indexTemp,i) | |
index[indexTemp] <- group | |
indexTemp <- vector() | |
group <- group + 1 | |
} | |
if(timeDiffs[i] <= thresh && timeDiffs[i+1] > thresh && length(indexTemp) < blockSize){ | |
indexTemp <- vector() | |
} | |
} | |
return(index) | |
} | |
#### Standard word cloud code, I've made it a function just to make life a bit easier. | |
### textVector: a character vector | |
### myStopwords: some extra stopwords you might want | |
### returnTDM: if True return the TDM so you can see frequent words | |
createWC <- function(textVector,myStopwords, returnTDM = FALSE){ | |
### This is pretty boiler plate stuff | |
# Create a corpus of words | |
myCorpus <- Corpus(VectorSource(textVector)) | |
# Here we'll lowercase everything, strip punctionation, and remove stop words | |
myCorpus <- tm_map(myCorpus, function(x) iconv(x, to='UTF-8-MAC', sub='byte')) | |
myCorpus <- tm_map(myCorpus, tolower) | |
myCorpus <- tm_map(myCorpus, removePunctuation) | |
myCorpus <- tm_map(myCorpus, removeNumbers) | |
### Create stopwords list and strip them out | |
myStopwords <- c(stopwords('english'),myStopwords) | |
myCorpus <- tm_map(myCorpus, removeWords, myStopwords) | |
### Stem words | |
dictCorpus <- myCorpus | |
myCorpus <- tm_map(myCorpus, stemDocument) | |
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus) | |
### Next we create out term document matrix | |
mytdm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 2)) | |
m <- as.matrix(mytdm) | |
v <- sort(rowSums(m), decreasing=TRUE) | |
myNames <- names(v) | |
d <- data.frame(word=myNames, freq=v) | |
pal <- colorRampPalette(c("red","blue"))(10) | |
wordcloud(d$word, d$freq, min.freq=3,colors=pal,random.order=FALSE) | |
if(returnTDM){ | |
return(mytdm) | |
} | |
} | |
# Set SSL certs globally | |
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))) | |
#### This is all the code you want to authorize with the twitter API and then grab tweets! | |
reqURL <- "https://api.twitter.com/oauth/request_token" | |
accessURL <- "https://api.twitter.com/oauth/access_token" | |
authURL <- "https://api.twitter.com/oauth/authorize" | |
consumerKey <- "YOUR_KEY" | |
consumerSecret <- "YOUR_SECRET" | |
twitCred <- OAuthFactory$new(consumerKey=consumerKey,consumerSecret=consumerSecret,requestURL=reqURL,accessURL=accessURL,authURL=authURL) | |
### Cover your bases if you don't have certificates installed | |
twitCred$handshake() | |
registerTwitterOAuth(twitCred) | |
scioSearch <- searchTwitter(searchString = "@naturalsciences", n = 1000) | |
###### Get everything into dataframes | |
fullText <- ldply(scioSearch, function(x) return(x$text)) | |
retweets <- ldply(scioSearch, function(x) return(x$retweetCount)) | |
## Grab | |
wasRetweet <- ldply(scioSearch, function(x) !is.na(grep("RT",x$text)[1])) | |
### View the fraction of retweets | |
print(sum(was_retweet)/length(scioSearch)) | |
### We could also see the rate of tweeting | |
# Extract user names | |
screenName <- ldply(scioSearch, function(x) return(x$screenName)) | |
### Extract time stamps | |
dates <- ldply(scioSearch, function(x) return(as.POSIXct(x$created))) | |
### Create tweet date data frame | |
tweetDF <- data.frame(screenName,dates,fullText) | |
### Add accumulation | |
tweetDF$tweetN <- dim(tweetDF)[1]:1 | |
tweetDF$diffs <- rapidTweets(dates$V1,thresh=2500) | |
colnames(tweetDF) <- c("screenName","date","fullText","tweetN","groups") | |
ggplot(tweetDF,aes(x=date,y=tweetN))+geom_point() theme_bw(20) + xlab("Date") + ylab("Cumulative tweet number") | |
twitStopwords <- c("http","https","t.co","co","naturalsciences","amp","today","\n","diff","one","talk","museum") | |
textVect <- paste(tweetDF$fullText,collapse=" ") | |
createWC(textVect,twitStopwords) | |
### Now let's subset by our data by runs of tweets, let's look at group 1 | |
g1 <- paste(tweetDF$fullText[tweetDF$group==1],collapse=" ") | |
outTDM <- createWC(g1,twitStopwords,returnTDM=T) | |
freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE) | |
fwordsg1 <- paste(names(freqWords[1:4]),collapse = ", ") | |
### Now let's subset by our data by runs of tweets, let's look at group 2 | |
g2 <- paste(tweetDF$fullText[tweetDF$group==2],collapse=" ") | |
outTDM <- createWC(g2,twitStopwords,returnTDM=T) | |
freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE) | |
fwordsg2 <-paste(names(freqWords[1:4]),collapse = ", ") | |
### Now let's subset by our data by runs of tweets, let's look at group 3 | |
g3 <- paste(tweetDF$fullText[tweetDF$group==3],collapse=" ") | |
outTDM <- createWC(g3,twitStopwords,returnTDM=T) | |
freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE) | |
fwordsg3 <- paste(names(freqWords[1:4]),collapse = ", ") | |
tplot <- ggplot(tweetDF,aes(x=date,y=tweetN,colour = as.factor(groups)))+geom_point() +theme_bw(20) + scale_colour_manual("Rapid Tweet \n groups",values = c("black","red","blue","orange")) + xlab("Date") +ylab("Cumulative tweet number") | |
### Assemble data frame for text plots | |
### offset the dates for plotting, and do this out by hand because sapply doesn't return posix dates | |
textTweetN <- sapply(1:3, function(x) median(tweetDF$tweetN[tweetDF$group==x])) | |
### Play around with this to get it right | |
offs <- 100000 | |
textTweetDate <- c(median(tweetDF$date[tweetDF$group==1]) + offs ,median(tweetDF$date[tweetDF$group==2]) + offs,median(tweetDF$date[tweetDF$group==3]) + offs) | |
textDF <- data.frame(c(fwordsg1,fwordsg2,fwordsg3),textTweetN,textTweetDate,1:3) | |
colnames(textDF) <- c("fText","tweetN","date","groups") | |
tplot + geom_text(data = textDF,aes(label = fText, x=date,y=tweetN,colour=as.factor(groups))) + guides(colour=FALSE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment