Forked from dempseydata/Basic Tweet Tag Popularity Comparison
Created June 26, 2014 11:18
# Basic Twitter Analysis
# analyze and compare tweets for a defined set of hash and @ tags
# base code from "Data Science with R", a book for an introductory course from Syracuse, as per http://blog.revolutionanalytics.com/2013/02/free-e-book-on-data-science-with-r.html
# load the required packages:
require("bitops")    # bitwise operations (RCurl dependency)
require("RCurl")     # HTTP requests
require("RJSONIO")   # JSON parsing
require("twitteR")   # Twitter API client
require("ROAuth")    # OAuth authentication
require("stringr")   # for string manipulation
require("plyr")      # data frame manipulation
require("ggplot2")   # plotting
require("tm")        # for text mining
require("wordcloud") # for word cloud visual
############################
# Task 1 : handshake and register with Twitter in order to access the API
# - you must have a registered API key in order to use the API, contained within an RData file - see code for notes
############################
# create the credential variable
credential <- NULL
# check to see if there is a credential file present
if(file.exists("credential.RData"))
{
    load(file = "credential.RData")
} else {
if(file.exists("mykey.Rdata")) | |
{ | |
load(file = "mykey.RData") | |
# get the credentials etc | |
credential <- OAuthFactory$new(consumerKey=my.key, | |
consumerSecret="<YOUR secret goes here>", | |
requestURL="https://api.twitter.com/oauth/request_token", | |
accessURL="https://api.twitter.com/oauth/access_token", | |
authURL="https://api.twitter.com/oauth/authorize") | |
# handshake with Twitter in order to get the necessary confirmation pin | |
# THIS WILL LIKELY REQUIRE MANUAL URL ENTRY INTO A BROWSER - RStudio for example, will not allow copy and paste of the returned URL from the console | |
credential$handshake() | |
save(credential,file = "credential.RData") | |
} else { | |
stop("In order to run this program, you need to have an RData file called mykey.RData containing a single variable called my.key, containing, your twitterAPI key") | |
} | |
} | |
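# For reference, a minimal sketch (not part of the original script) of how the mykey.RData
# file described above could be created once in a separate R session - the key string is a
# placeholder, not a real value:
#   my.key <- "<YOUR Twitter API key goes here>"
#   save(my.key, file = "mykey.RData")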
# Now that we have the credential, we register it for this session
if(registerTwitterOAuth(credential))
{
    cat("Credential has been registered","\n")
} else {
    stop("There is an issue with the credential file; delete it and rerun the script in order to redo the handshake")
}
############################
# Task 2 : Access the API and download the raw data
############################
# initiate the tags
my.tags <- NULL
# read the tags from a CSV file with two columns and no column header
if(file.exists("twitter_tags.txt"))
{
    my.tags <- read.csv("twitter_tags.txt", header = FALSE, stringsAsFactors = FALSE, col.names = c("group","tag"))
} else {
    stop("Cannot find the file twitter_tags.txt in the working directory. The file needs two or more lines in the csv format grouping,tag with no column headers. The grouping label should be the owner of the Twitter account (such as a company) that you want to group by and compare, without the @")
}
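# For reference, a hypothetical twitter_tags.txt might contain the two lines below
# (the group and tag values are illustrative only, not from any real analysis):
#   Sidecar,#sidecar
#   Sidecar,@Sidecar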
# this function returns a data frame for each tweet search term requested
# (a rolling 14-day window finishing YESTERDAY, i.e. 14 complete days, is available but currently commented out in the searchTwitter call)
TweetFrame <- function(search.term, max.tweets){
    twt.list <- searchTwitter(search.term, n = max.tweets)#, since=as.character(Sys.Date() - 14), until=as.character(Sys.Date() - 1)) # get the list
    a <- do.call("rbind", lapply(twt.list, as.data.frame)) # convert to a data frame
    a <- a[order(as.integer(a$created)),] # order in increasing time stamp
    a.delay <- diff(as.integer(a$created)) # the delay between consecutive tweets, in seconds
    a.delay <- c(NA, a.delay) # add an NA record to represent the first tweet having no delay
    a$delay <- a.delay # add the delay column to the data frame; the first tweet keeps an NA delay
    return(cbind(search.term, a))
}
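# A quick, hedged illustration of the delay calculation above on made-up timestamps
# (illustrative only; the real 'created' values come from searchTwitter):
#   ts <- as.POSIXct(c("2014-06-26 10:00:00", "2014-06-26 10:00:30", "2014-06-26 10:02:00"))
#   c(NA, diff(as.integer(ts)))   # -> NA 30 90, the seconds between consecutive tweets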
# go and fetch 500 tweets for each of the required tags, returning a list of data frames
the.tweets <- apply(my.tags, 1, (function(x) TweetFrame(x[2], 500)))
# convert this list into a single data frame
the.tweets <- do.call("rbind", the.tweets)
# rename the first column
colnames(the.tweets)[1] <- "tag"
# merge with my.tags in order to add the grouping column
the.tweets <- merge(my.tags, the.tweets, by = "tag")
##########################
# Task 3 : Add additional columns for: how long? how many words? is it a retweet? How many retweets? Was it tweeted by the owner? Was it a retweet of the owner's tweet? Does it contain any tags? How many and which other tags from the list are included?
#########################
# length
the.tweets$length <- str_length(the.tweets$text)
# number of words, after collapsing repeated spaces - #words = #spaces + 1, e.g. "dog ran fast" = 3 words, 2 spaces
the.tweets$wordCount <- str_count(str_replace_all(the.tweets$text, " {2,}", " "), " ") + 1
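# Sanity check of the word-count logic, using the example from the comment above:
#   str_count(str_replace_all("dog  ran fast", " {2,}", " "), " ") + 1   # -> 3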
# is it a retweet, and if so, of whom?
the.tweets$rtOf <- str_match(the.tweets$text, "RT @[a-zA-Z0-9_]*:") # find the latest RT
the.tweets$rtOf <- str_replace(str_replace(the.tweets$rtOf, "RT @", ""), ":", "") # trim RT @ and :
the.tweets$rtOfOwner <- toupper(the.tweets$rtOf) == toupper(the.tweets$group)
# the problem with the step below is that it only counts exact matches of the retweeter being equal to the group, e.g. Sidecar = Sidecar
# it does not capture sub-twitter accounts, such that SidecarCHI != Sidecar (the CHI indicating it is the Chicago office of Sidecar)
# So, here we UNDERCOUNT, but we also avoid over-counting false Twitter accounts such as 'SidecarSucks' or other negative accounts as if they were the vendor
the.tweets$rtByOwner <- (the.tweets$isRetweet) & (toupper(the.tweets$screenName) == toupper(the.tweets$group))
# Need to add one column for each tag in the search list and flag its inclusion in a specific tweet
# Will need to replace the # and @ symbols with text such as 'hash' and 'at'
tags.in.tweets <- as.data.frame(apply(my.tags, 1, (function(x) !(is.na(str_match(the.tweets$text, x[2]))))))
# assign column names
colnames(tags.in.tweets) <- str_replace(str_replace(my.tags[,2], "#", "hash"), "@", "at")
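# For reference, how a tag maps to a column name under the replacement above
# (illustrative values only):
#   str_replace(str_replace("#sidecar", "#", "hash"), "@", "at")   # -> "hashsidecar"
#   str_replace(str_replace("@Sidecar", "#", "hash"), "@", "at")   # -> "atSidecar"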
# combine the two data frames
the.tweets <- cbind(the.tweets, tags.in.tweets)
# we no longer need the separate data frame of tags in tweets, the tags, or the credential variable
rm("tags.in.tweets", "my.tags", "credential")
##### should I remove the retweets before counting the words?
##########################
# Task 4 : Analyze the data
# which tag has the shortest delay between tweets
# what is the relative popularity of the tags
# who has the most retweets
# who has more public versus host tweets
# what are the popular groupings of tags
#########################
# quickly grab the means of each tag
the.tweets.means <- ddply(the.tweets, ~group+tag, numcolwise(mean, na.rm=TRUE))
# add on the number of tweets for each tag in the data set
the.tweets.means <- merge(the.tweets.means, ddply(the.tweets, ~group+tag, nrow), by=c("group","tag"))
colnames(the.tweets.means) <- c("group","tag","favoriteCount","retweetCount","delay","length","wordCount","tweetCount")
# But how likely is it that the values we see in this data are representative, so that we can be confident one tag really is more popular than the rest? A Poisson test (PT) can help.
# 1 - PT each tag to look at the confidence interval - the narrower, the better
single.pt <- ddply(the.tweets, ~group+tag, function(x) {
    a <- poisson.test(sum(x$delay <= mean(x$delay, na.rm=TRUE), na.rm=TRUE), nrow(x))
    # the % less than the mean, lower bound of the confidence interval, upper bound of the confidence interval
    b <- c(a["estimate"][[1]][[1]], a["conf.int"][[1]][[1]], a["conf.int"][[1]][[2]])
})
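# A hedged illustration of what the single-sample poisson.test above returns, on made-up
# counts (say 300 of 500 delays were at or below the mean for some tag):
#   pt <- poisson.test(300, 500)
#   pt$estimate   # ~0.6, the estimated rate of "quicker than the mean" tweets per tweet
#   pt$conf.int   # the 95% confidence interval around that rate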
# merge and rename columns
the.tweets.means <- merge(the.tweets.means, single.pt, by=c("group","tag"))
colnames(the.tweets.means) <- c("group","tag","favoriteCount","retweetCount","delay","length","wordCount","tweetCount","estimate","lowerConfidence","upperConfidence")
# force a max upperConfidence interval of 1 (100%)
the.tweets.means$upperConfidence[the.tweets.means$upperConfidence > 1] <- 1
# 2 - Benchmark all tags against the one with the smallest mean delay, in terms of % of tweets quicker than the mean, then use the PT to check the confidence intervals. If a confidence interval overlaps with that of the 'quickest' tag, there is a chance the quickest was not actually the quickest
multiple.pt <- ddply(the.tweets, ~group+tag, function(x) {
    a.mean <- mean(x$delay, na.rm=TRUE) # the mean delay of the tag
    a.num.delays <- sum(x$delay <= a.mean, na.rm=TRUE) # number of delays less than the mean of the tag
    a.nrow <- nrow(x) # the number of rows of the tag
    # now let's compare every tag against these values
    a.compare <- ddply(the.tweets, ~group+tag, function(y, b.mean=a.mean, b.num.delays=a.num.delays, b.row=a.nrow) {
        b.test <- poisson.test(c(b.num.delays, # the tag being used as the baseline, from the outer ddply
                                 sum(y$delay <= b.mean, na.rm=TRUE)), # number of delays of the tag being analyzed quicker than the mean delay of the tag from the outer ddply
                               c(b.row, nrow(y))) # nrow for the tag from the outer ddply and the inner ddply
        b.result <- c(x$tag[1], b.test["estimate"][[1]][[1]], b.test["conf.int"][[1]][[1]], b.test["conf.int"][[1]][[2]])
    })
})
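# A hedged illustration of the two-sample comparison inside multiple.pt, on made-up counts:
# if the fastest tag had 300 of 500 delays quicker than its mean, and another tag only 150 of 500,
#   poisson.test(c(300, 150), c(500, 500))$estimate   # -> 2, i.e. the other tag is roughly twice as slow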
# rename the columns
colnames(multiple.pt) <- c("group","tag","compareTag","compareEst","compareLowerConf","compareUpperConf")
# the comparison columns come back as character, most likely because c() above mixes the character tag with the numeric values, so coerce them back to numeric
multiple.pt$compareEst <- as.numeric(multiple.pt$compareEst)
multiple.pt$compareLowerConf <- as.numeric(multiple.pt$compareLowerConf)
multiple.pt$compareUpperConf <- as.numeric(multiple.pt$compareUpperConf)
# this gets me the comparisons of all tags against all others, but in reality all I want is the comparison of every tag against the tag with the lowest mean delay. So, let's pull that one out of the data frame and then add the columns into the means data frame
the.tweets.means <- merge(the.tweets.means,
                          # pull out the records from the multiple.pt results where the compare tag is the one with the lowest delay
                          multiple.pt[multiple.pt$compareTag == the.tweets.means$tag[the.tweets.means$delay == min(the.tweets.means$delay)],],
                          by=c("group","tag"))
# reorder the data frame for plotting by ascending mean delay
the.tweets.means <- the.tweets.means[with(the.tweets.means, order(delay)),]
# reorder the factor for tag to reflect the ascending mean delay
the.tweets.means$tag <- factor(the.tweets.means$tag, as.character(the.tweets.means[order(the.tweets.means$delay),]$tag), ordered = TRUE)
# plot histogram of tweet delays and the mean delay
hist.mean.plot <- ggplot(the.tweets.means) + # base of plot, using the means data to ensure ascending mean ordering
    geom_vline(aes(xintercept=delay), color="red") + # the mean delay for each tag (the first tweet of each tag has an NA delay, which is dropped)
    geom_histogram(data=the.tweets, aes(x=delay), binwidth=60, colour="black", fill="white") + # main histogram, 60-second bins
    facet_wrap(~tag+group, ncol=1) + # layout by tag and then group - note the sorting is done in the order the facets are specified
    scale_x_continuous(limits=c(0,3600), name="Delay in seconds (1 hour limit)") + # limit histogram to 1 hour
    scale_y_continuous(name="# Tweets") + # relabel the Y
    ggtitle(expression(atop("Histogram of delays between tweets", atop(italic("Red bar indicates the mean delay for the tag"), ""))))
print(hist.mean.plot)
# plot the % of tweets arriving before the mean delay, and the confidence interval based on an ideal Poisson distribution
conf.int.plot <- ggplot(the.tweets.means, aes(x=tag)) +
    geom_point(aes(y=(estimate*100)), size=3) +
    geom_errorbar(aes(ymin=(lowerConfidence*100), ymax=(upperConfidence*100)), width=.1) +
    scale_y_continuous(limits=c(0,100), name="% Tweets quicker than the mean") +
    scale_x_discrete("Tag") +
    ggtitle(expression(atop("% of tweets quicker than the mean delay", atop(italic("95% confidence interval"), ""))))
print(conf.int.plot)
# plot the relative speeds of the tweets when compared to the tag with the smallest mean delay, and the confidence intervals
upperSpeedLimit <- the.tweets.means$compareUpperConf[the.tweets.means$tag == the.tweets.means$compareTag] # find the upper confidence limit of the fastest tag
rel.slowness.plot <- ggplot(the.tweets.means, aes(x=tag)) + # using the means data
    geom_point(aes(y=compareEst), size=3) + # plot the estimated N times slower
    geom_errorbar(aes(ymin=compareLowerConf, ymax=compareUpperConf), width=.1) + # plot the confidence interval
    geom_hline(aes(yintercept=upperSpeedLimit), color="red", linetype=2) + # reference line at the fastest tag's upper confidence limit
    scale_y_continuous(name="N times") + # label Y
    scale_x_discrete("Tag") + # label X
    ggtitle(expression(atop("How slow are the tags compared to the fastest?", atop(italic("95% confidence interval"), ""))))
print(rel.slowness.plot)
#######################
# Task 5 - Text analysis of tweets
#######################
# this function removes all URLs, retweet labels, hashtags and user names, leaving plain words only
TweetsToCleanedWordFrequency <- function(tweets)
{
    # first, let's clean all the tweets up
    tweets$text <- str_replace_all(tweets$text, "http:[./a-zA-Z0-9]*", "") # remove URLs, assuming alphanumeric shortened links
    tweets$text <- str_replace_all(tweets$text, "https:[./a-zA-Z0-9]*", "") # remove https URLs
    tweets$text <- str_replace_all(tweets$text, "RT @[a-zA-Z0-9_]*", "") # remove all retweet labels
    tweets$text <- str_replace_all(tweets$text, "MT @[a-zA-Z0-9_]*", "") # remove all modified-tweet labels
    tweets$text <- str_replace_all(tweets$text, "#[a-zA-Z0-9_]*", "") # remove all hashtags
    tweets$text <- str_replace_all(tweets$text, "@[a-zA-Z0-9_]*", "") # remove all screen names
    tweets$text <- str_replace_all(tweets$text, "[^[:alpha:] ]", "") # only keep alpha - removes all non-western and strange unicode characters
    tweets$text <- str_replace_all(tweets$text, " {2,}", " ") # replace all multiple spaces with a single space
    # now let's use the tm package to do some text mining
    tweet.corpus <- Corpus(VectorSource(tweets$text)) # convert to a corpus object
    tweet.corpus <- tm_map(tweet.corpus, tolower) # force everything to lower case first, so the lower-case stop word list matches
    tweet.corpus <- tm_map(tweet.corpus, removeWords, stopwords('english')) # remove stop words such as "the" "a" "at" etc
    # In many cases, words need to be stemmed to retrieve their radicals. For instance, "example" and "examples" are both stemmed to "exampl". After that, the stems are completed back to their original forms so the words look "normal" - this consolidates the various forms of a word (thank + thanks + thanked etc)
    tweet.corpus.dict <- tweet.corpus
    tweet.corpus <- tm_map(tweet.corpus, stemDocument)
    tweet.corpus <- tm_map(tweet.corpus, stemCompletion, dictionary=tweet.corpus.dict) #, mc.cores=4)
    # increasing the cores used for stem completion can speed things up - in my example case, from 3:37 down to 2:43
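    # A quick, hedged illustration of stemming and completion on toy words (not part of the corpus;
    # wordStem comes from the SnowballC package that tm relies on for stemDocument):
    #   SnowballC::wordStem(c("thanks", "thanked", "thanking"))   # -> "thank" "thank" "thank"
    #   stemCompletion("exampl", dictionary = c("example"))       # -> "example"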
    # we now have a bag of words, time to create a term-document matrix
    tweet.tdm <- TermDocumentMatrix(tweet.corpus)
    td.matrix <- as.matrix(tweet.tdm) # convert to a matrix
    td.matrix <- sort(rowSums(td.matrix), decreasing=TRUE) # sum the word counts and sort them
    word.freq.frame <- data.frame(word=names(td.matrix), freq=td.matrix) # convert to a data frame for visualization
    return(word.freq.frame)
}
# generate a separate word frequency count for each tag
word.freq.frame <- ddply(the.tweets[,1:3], ~group+tag, TweetsToCleanedWordFrequency)
# in order to plot multiple word clouds on a single page, set the base graphics parameters up as a grid of panels
pal <- brewer.pal(9,"GnBu")[-(1:2)] # generate a color palette, but lose the palest shades
old.par <- par(no.readonly=TRUE) # store current params before altering
par(mfrow = c(ceiling(nrow(the.tweets.means)/3), 3), # set to 3 columns wide and however many rows are needed
    oma=c(0, 0, 2, 0)) # ensure there is room at the top for an overall title
d_ply(word.freq.frame, ~group+tag, function(x){
    wordcloud(x$word, x$freq, max.words=50, colors=pal)
    title(max(str_c(x$group," - ",x$tag)))
})
par(font.main=2, ps=30) # set font style for main title only
title(main="Twitter Word Clouds", outer = TRUE)
par(old.par) # reset the graphics params