emhart · August 29, 2015 13:57
diff --git a/naturalsciencesTwitter.R b/naturalsciencesTwitter.R
 library(twitteR)
 library(RCurl)
 library(plyr)
 library(tm)
 library(wordcloud)
 library(ggplot2)


 ### Great now let's figure out a way to highlight rapid acceleration of tweets
 ### Let's write a function that will take a vector of posix dates, a time threshold (in seconds), and assign them indexes
 ### dates: a vector of POSIXct dates
 ### thresh: a minimum span of time to consider it a run on tweets.  Default is 100 seconds
 ### blockSize: The minimum number of tweets in a row to consider in a run of rapid tweets, Default is 10

 rapidTweets <- function(dates, thresh = 100, blockSize = 10){
  index <- rep(0,length(dates))
  group <- 1
  indexTemp <- vector()
  timeDiffs <- abs(diff(dates))
  for(i in 1:(length(timeDiffs)-1)){
    if(timeDiffs[i] <= thresh && timeDiffs[i+1] <= thresh){
      indexTemp <- c(indexTemp,i)
    }
    if(timeDiffs[i] <= thresh && timeDiffs[i+1] > thresh && length(indexTemp) >= blockSize){
      indexTemp <- c(indexTemp,i)
      index[indexTemp] <- group
      indexTemp <- vector()
      group <- group + 1
    }   
    if(timeDiffs[i] <= thresh && timeDiffs[i+1] > thresh && length(indexTemp) < blockSize){
      indexTemp <- vector()
    }
    
  }
  return(index)
 }



 #### Standard word cloud code, I've made it a function just to make life a bit easier.  
 ### textVector: a character vector
 ### myStopwords: some extra stopwords you might want
 ### returnTDM: if True return the TDM so you can see frequent words

 createWC <- function(textVector,myStopwords, returnTDM = FALSE){
  ###  This is pretty boiler plate stuff
  # Create a corpus of words
  myCorpus <- Corpus(VectorSource(textVector))
  
  # Here we'll lowercase everything, strip punctionation, and remove stop words
  
  myCorpus <- tm_map(myCorpus, function(x) iconv(x, to='UTF-8-MAC', sub='byte'))
  myCorpus <- tm_map(myCorpus, tolower)
  myCorpus <- tm_map(myCorpus, removePunctuation)
  myCorpus <- tm_map(myCorpus, removeNumbers)
  
  ### Create stopwords list and strip them out
  myStopwords <- c(stopwords('english'),myStopwords)
  myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
  
  
  ### Stem words
  
  dictCorpus <- myCorpus
  myCorpus <- tm_map(myCorpus, stemDocument)
  myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
  ### Next we create out term document matrix
  mytdm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 2))
  
  
  
  
  
  m <- as.matrix(mytdm)
  
  v <- sort(rowSums(m), decreasing=TRUE)
  myNames <- names(v)
  d <- data.frame(word=myNames, freq=v)
  pal <- colorRampPalette(c("red","blue"))(10)
  wordcloud(d$word, d$freq, min.freq=3,colors=pal,random.order=FALSE)
  
  if(returnTDM){
    return(mytdm)
  }
  
 }

 # Set SSL certs globally
 options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))


 #### This is all the code you want to authorize with the twitter API and then grab tweets!

 reqURL <- "https://api.twitter.com/oauth/request_token"
 accessURL <- "https://api.twitter.com/oauth/access_token"
 authURL <- "https://api.twitter.com/oauth/authorize"


 consumerKey <- "YOUR_KEY"
 consumerSecret <- "YOUR_SECRET"

 twitCred <- OAuthFactory$new(consumerKey=consumerKey,consumerSecret=consumerSecret,requestURL=reqURL,accessURL=accessURL,authURL=authURL)

 ### Cover your bases if you don't have certificates installed

 twitCred$handshake()

 registerTwitterOAuth(twitCred)

  
  
 scioSearch <- searchTwitter(searchString = "@naturalsciences", n = 1000)
  
 ###### Get everything into dataframes
  
 fullText <- ldply(scioSearch, function(x) return(x$text))
 retweets <- ldply(scioSearch, function(x) return(x$retweetCount))
 ## Grab
 wasRetweet <- ldply(scioSearch, function(x) !is.na(grep("RT",x$text)[1]))

  
 ### View the fraction of retweets
 print(sum(was_retweet)/length(scioSearch))

 ### We could also see the rate of tweeting

 # Extract user names
 screenName <- ldply(scioSearch, function(x) return(x$screenName))
 ### Extract time stamps
 dates <- ldply(scioSearch, function(x) return(as.POSIXct(x$created)))

 ### Create tweet date data frame

 tweetDF <- data.frame(screenName,dates,fullText)
 ### Add accumulation
 tweetDF$tweetN <- dim(tweetDF)[1]:1
 tweetDF$diffs <- rapidTweets(dates$V1,thresh=2500)
 colnames(tweetDF) <- c("screenName","date","fullText","tweetN","groups")

 ggplot(tweetDF,aes(x=date,y=tweetN))+geom_point() theme_bw(20) + xlab("Date") + ylab("Cumulative tweet number")





 twitStopwords <- c("http","https","t.co","co","naturalsciences","amp","today","\n","diff","one","talk","museum")
 textVect <- paste(tweetDF$fullText,collapse=" ")

 createWC(textVect,twitStopwords)

 ### Now let's subset by our data by runs of tweets, let's look at group 1

 g1  <- paste(tweetDF$fullText[tweetDF$group==1],collapse=" ")

 outTDM <- createWC(g1,twitStopwords,returnTDM=T)

 freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE)
 fwordsg1 <- paste(names(freqWords[1:4]),collapse = ", ")


 ### Now let's subset by our data by runs of tweets, let's look at group 2

 g2  <- paste(tweetDF$fullText[tweetDF$group==2],collapse=" ")

 outTDM <- createWC(g2,twitStopwords,returnTDM=T)

 freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE)
 fwordsg2 <-paste(names(freqWords[1:4]),collapse = ", ")

 ### Now let's subset by our data by runs of tweets, let's look at group 3

 g3  <- paste(tweetDF$fullText[tweetDF$group==3],collapse=" ")

 outTDM <- createWC(g3,twitStopwords,returnTDM=T)

 freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE)
 fwordsg3 <- paste(names(freqWords[1:4]),collapse = ", ")


 tplot <- ggplot(tweetDF,aes(x=date,y=tweetN,colour = as.factor(groups)))+geom_point() +theme_bw(20) + scale_colour_manual("Rapid Tweet \n groups",values = c("black","red","blue","orange")) + xlab("Date") +ylab("Cumulative tweet number")


 ### Assemble data frame for text plots
 ### offset the dates for plotting, and do this out by hand because sapply doesn't return posix dates
 textTweetN <- sapply(1:3, function(x) median(tweetDF$tweetN[tweetDF$group==x]))
 ### Play around with this to get it right
 offs <- 100000
 textTweetDate <- c(median(tweetDF$date[tweetDF$group==1]) + offs ,median(tweetDF$date[tweetDF$group==2]) + offs,median(tweetDF$date[tweetDF$group==3]) + offs)

 textDF <- data.frame(c(fwordsg1,fwordsg2,fwordsg3),textTweetN,textTweetDate,1:3)
 colnames(textDF) <- c("fText","tweetN","date","groups")
 tplot + geom_text(data = textDF,aes(label = fText, x=date,y=tweetN,colour=as.factor(groups))) + guides(colour=FALSE)
	library(twitteR)
	library(RCurl)
	library(plyr)
	library(tm)
	library(wordcloud)
	library(ggplot2)


	### Great now let's figure out a way to highlight rapid acceleration of tweets
	### Let's write a function that will take a vector of posix dates, a time threshold (in seconds), and assign them indexes
	### dates: a vector of POSIXct dates
	### thresh: a minimum span of time to consider it a run on tweets. Default is 100 seconds
	### blockSize: The minimum number of tweets in a row to consider in a run of rapid tweets, Default is 10

	rapidTweets <- function(dates, thresh = 100, blockSize = 10){
	index <- rep(0,length(dates))
	group <- 1
	indexTemp <- vector()
	timeDiffs <- abs(diff(dates))
	for(i in 1:(length(timeDiffs)-1)){
	if(timeDiffs[i] <= thresh && timeDiffs[i+1] <= thresh){
	indexTemp <- c(indexTemp,i)
	}
	if(timeDiffs[i] <= thresh && timeDiffs[i+1] > thresh && length(indexTemp) >= blockSize){
	indexTemp <- c(indexTemp,i)
	index[indexTemp] <- group
	indexTemp <- vector()
	group <- group + 1
	}
	if(timeDiffs[i] <= thresh && timeDiffs[i+1] > thresh && length(indexTemp) < blockSize){
	indexTemp <- vector()
	}

	}
	return(index)
	}



	#### Standard word cloud code, I've made it a function just to make life a bit easier.
	### textVector: a character vector
	### myStopwords: some extra stopwords you might want
	### returnTDM: if True return the TDM so you can see frequent words

	createWC <- function(textVector,myStopwords, returnTDM = FALSE){
	### This is pretty boiler plate stuff
	# Create a corpus of words
	myCorpus <- Corpus(VectorSource(textVector))

	# Here we'll lowercase everything, strip punctionation, and remove stop words

	myCorpus <- tm_map(myCorpus, function(x) iconv(x, to='UTF-8-MAC', sub='byte'))
	myCorpus <- tm_map(myCorpus, tolower)
	myCorpus <- tm_map(myCorpus, removePunctuation)
	myCorpus <- tm_map(myCorpus, removeNumbers)

	### Create stopwords list and strip them out
	myStopwords <- c(stopwords('english'),myStopwords)
	myCorpus <- tm_map(myCorpus, removeWords, myStopwords)


	### Stem words

	dictCorpus <- myCorpus
	myCorpus <- tm_map(myCorpus, stemDocument)
	myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
	### Next we create out term document matrix
	mytdm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 2))





	m <- as.matrix(mytdm)

	v <- sort(rowSums(m), decreasing=TRUE)
	myNames <- names(v)
	d <- data.frame(word=myNames, freq=v)
	pal <- colorRampPalette(c("red","blue"))(10)
	wordcloud(d$word, d$freq, min.freq=3,colors=pal,random.order=FALSE)

	if(returnTDM){
	return(mytdm)
	}

	}

	# Set SSL certs globally
	options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))


	#### This is all the code you want to authorize with the twitter API and then grab tweets!

	reqURL <- "https://api.twitter.com/oauth/request_token"
	accessURL <- "https://api.twitter.com/oauth/access_token"
	authURL <- "https://api.twitter.com/oauth/authorize"


	consumerKey <- "YOUR_KEY"
	consumerSecret <- "YOUR_SECRET"

	twitCred <- OAuthFactory$new(consumerKey=consumerKey,consumerSecret=consumerSecret,requestURL=reqURL,accessURL=accessURL,authURL=authURL)

	### Cover your bases if you don't have certificates installed

	twitCred$handshake()

	registerTwitterOAuth(twitCred)



	scioSearch <- searchTwitter(searchString = "@naturalsciences", n = 1000)

	###### Get everything into dataframes

	fullText <- ldply(scioSearch, function(x) return(x$text))
	retweets <- ldply(scioSearch, function(x) return(x$retweetCount))
	## Grab
	wasRetweet <- ldply(scioSearch, function(x) !is.na(grep("RT",x$text)[1]))


	### View the fraction of retweets
	print(sum(was_retweet)/length(scioSearch))

	### We could also see the rate of tweeting

	# Extract user names
	screenName <- ldply(scioSearch, function(x) return(x$screenName))
	### Extract time stamps
	dates <- ldply(scioSearch, function(x) return(as.POSIXct(x$created)))

	### Create tweet date data frame

	tweetDF <- data.frame(screenName,dates,fullText)
	### Add accumulation
	tweetDF$tweetN <- dim(tweetDF)[1]:1
	tweetDF$diffs <- rapidTweets(dates$V1,thresh=2500)
	colnames(tweetDF) <- c("screenName","date","fullText","tweetN","groups")

	ggplot(tweetDF,aes(x=date,y=tweetN))+geom_point() theme_bw(20) + xlab("Date") + ylab("Cumulative tweet number")





	twitStopwords <- c("http","https","t.co","co","naturalsciences","amp","today","\n","diff","one","talk","museum")
	textVect <- paste(tweetDF$fullText,collapse=" ")

	createWC(textVect,twitStopwords)

	### Now let's subset by our data by runs of tweets, let's look at group 1

	g1 <- paste(tweetDF$fullText[tweetDF$group==1],collapse=" ")

	outTDM <- createWC(g1,twitStopwords,returnTDM=T)

	freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE)
	fwordsg1 <- paste(names(freqWords[1:4]),collapse = ", ")


	### Now let's subset by our data by runs of tweets, let's look at group 2

	g2 <- paste(tweetDF$fullText[tweetDF$group==2],collapse=" ")

	outTDM <- createWC(g2,twitStopwords,returnTDM=T)

	freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE)
	fwordsg2 <-paste(names(freqWords[1:4]),collapse = ", ")

	### Now let's subset by our data by runs of tweets, let's look at group 3

	g3 <- paste(tweetDF$fullText[tweetDF$group==3],collapse=" ")

	outTDM <- createWC(g3,twitStopwords,returnTDM=T)

	freqWords <- sort(rowSums(as.matrix(outTDM)), decreasing=TRUE)
	fwordsg3 <- paste(names(freqWords[1:4]),collapse = ", ")


	tplot <- ggplot(tweetDF,aes(x=date,y=tweetN,colour = as.factor(groups)))+geom_point() +theme_bw(20) + scale_colour_manual("Rapid Tweet \n groups",values = c("black","red","blue","orange")) + xlab("Date") +ylab("Cumulative tweet number")


	### Assemble data frame for text plots
	### offset the dates for plotting, and do this out by hand because sapply doesn't return posix dates
	textTweetN <- sapply(1:3, function(x) median(tweetDF$tweetN[tweetDF$group==x]))
	### Play around with this to get it right
	offs <- 100000
	textTweetDate <- c(median(tweetDF$date[tweetDF$group==1]) + offs ,median(tweetDF$date[tweetDF$group==2]) + offs,median(tweetDF$date[tweetDF$group==3]) + offs)

	textDF <- data.frame(c(fwordsg1,fwordsg2,fwordsg3),textTweetN,textTweetDate,1:3)
	colnames(textDF) <- c("fText","tweetN","date","groups")
	tplot + geom_text(data = textDF,aes(label = fText, x=date,y=tweetN,colour=as.factor(groups))) + guides(colour=FALSE)