Created
July 15, 2014 11:10
-
-
Save ttunguz/7ba8ec7e2588736e86f6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2)
library(plyr)     # ddply(), summarise and .() used by every *_by_hour / *_by_day summary
library(reshape)
library(scales)
### INSTRUCTIONS | |
# 1. Save tweet_activity_metrics.csv in the directory you run this script from
# 2. Change the time zone specification in the third IMPORT DATA statement
# 3. For each top_n_for_hour or top_n_for_day call, test for statistical significance by comparing the result of the top_n function with the result of the matrix t test
#IMPORT DATA | |
# Load the tweet export from the working directory.
data <- read.csv("tweet_activity_metrics.csv")
# Parse the timestamps as Europe/London, then render them as text in
# America/Los_Angeles; from here on `time` is a character column.
data$time <- format(
  as.POSIXct(data$time, tz = "Europe/London"),
  tz = "America/Los_Angeles"
)
# Hour of day (0-23) taken from the HH:MM part of the converted timestamp.
data$hour <- as.POSIXlt(
  strftime(data$time, format = "%H:%M"),
  format = "%H:%M"
)$hour
# Keep only tweets posted in hours 5 through 20.
data <- subset(data, hour > 4 & hour < 21)
## BASICS | |
## GENERAL PLOTS FOR HOUR | |
# Scatter plot of each tweet's engagement rate against the hour it was posted,
# jittered so overlapping points stay visible; written to era_by_tod.png.
ggplot(data) +
  # Bare column names: aes() is evaluated inside the plot data, so `data$col`
  # is redundant and ggplot2 warns that it is discouraged.
  geom_point(aes(hour, engagement.rate), size = 5, alpha = 0.7,
             colour = "red", position = "jitter") +
  xlab("") +
  ylab("Engagement Rate") +
  ggtitle("Engagement Rate by Time of Day") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  # No colour aesthetic is mapped (the point colour is a constant), so the
  # former scale_colour_manual() had nothing to act on and was dropped.
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  # Watermark anchored at the bottom-right corner of the panel.
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  scale_y_continuous(labels = percent_format())
# ggsave() writes the most recently created plot.
ggsave("era_by_tod.png", dpi = 300, width = 9, height = 6)
### TIME OF DAY FUNCTIONS | |
# Return the rows of `df` with the largest `average`, best first.
#
# df: data frame with a numeric `average` column (one row per hour).
# n:  number of rows wanted (non-negative).
# Returns a data frame with at most `n` rows; fewer when `df` is shorter.
top_n_for_hour <- function(df, n) {
  ranked <- df[order(df$average, decreasing = TRUE), , drop = FALSE]
  # seq_len() rather than 1:n: `1:0` would select row 1, and an `n` larger
  # than nrow(df) would pad the result with all-NA rows.
  ranked[seq_len(min(n, nrow(ranked))), , drop = FALSE]
}
# Pairwise t tests of `field` between posting hours.
#
# df:        data frame with an `hour` column and a numeric column `field`.
# field:     name (string) of the column to compare.
# top_hours: hours of interest; only pairs where both hours are in this set
#            are returned.
# Returns the melted p-value table (columns hour1, variable, value) limited
# to pairs with p < 0.05 whose two hours are both in `top_hours`.
time_of_day_t <- function(df, field, top_hours) {
  stopifnot(field %in% names(df))
  # 25 x 25 grid indexed directly by hour number; only hours 5..20 are
  # filled, the rest stay NA.
  p_values <- data.frame(matrix(NA, nrow = 25, ncol = 25))
  colnames(p_values) <- 1:25
  rownames(p_values) <- 1:25
  tested_hours <- 5:20
  for (i in tested_hours) {
    # which() drops rows whose hour is NA instead of producing NA entries.
    current <- df[[field]][which(df$hour == i)]
    for (j in tested_hours) {
      candidate <- df[[field]][which(df$hour == j)]
      if (length(current) > 1 && length(candidate) > 1) {
        # t.test() stops with an error when the samples are essentially
        # constant (e.g. all zeros); record such pairs as untestable (NA)
        # rather than aborting the whole analysis.
        p_values[i, j] <- tryCatch(
          t.test(current, candidate)$p.value,
          error = function(e) NA_real_
        )
      }
    }
  }
  # The factor column becomes melt()'s id variable; the 25 hour columns are
  # stacked into `variable` / `value`.
  p_values$hour1 <- factor(rownames(p_values))
  long <- melt(p_values)
  # NA p-values fail the comparison and are dropped by subset().
  subset(long, value < 0.05 & hour1 %in% top_hours & variable %in% top_hours)
}
# Flag the rows of a summary table that belong to the "top" set.
#
# hourly_data: summary data frame, one row per hour (or per weekday).
# top_hours:   values of the key column to flag.
# key:         name of the column matched against `top_hours`. When NULL
#              (the default) `hour` is used if present, otherwise `weekday`,
#              so the per-day summaries are flagged too instead of silently
#              getting top = 0 everywhere.
# Returns `hourly_data` with a numeric `top` column: 1 for flagged rows,
# 0 otherwise.
designate_top <- function(hourly_data, top_hours, key = NULL) {
  if (is.null(key)) {
    key <- if ("hour" %in% names(hourly_data)) "hour" else "weekday"
  }
  flags <- numeric(nrow(hourly_data))
  # A missing key column leaves every row unflagged.
  if (key %in% names(hourly_data)) {
    flags[hourly_data[[key]] %in% top_hours] <- 1
  }
  hourly_data$top <- flags
  hourly_data
}
## RETWEETS BY HOUR | |
# Mean retweets and number of posts for each posting hour.
rt_by_hour <- ddply(
  data, .(hour), summarise,
  average = mean(retweets),
  count_posts = length(Tweet.id)
)
# Flag the three hours with the highest mean and run the pairwise t tests.
top_retweet_hours <- top_n_for_hour(rt_by_hour, 3)$hour
rt_by_hour <- designate_top(rt_by_hour, top_retweet_hours)
retweet_t <- time_of_day_t(data, "retweets", top_retweet_hours)
# Author's note on the original data set: 8 and 9 are the best hours.
ggplot(rt_by_hour) +
  geom_bar(aes(hour, average, fill = top), stat = "identity") +
  xlab("Time of Day") +
  ylab("Retweets per Post") +
  ggtitle("Best Time of Day to Maximize RT") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  scale_fill_gradient2(low = "dodgerblue", high = "red", mid = "dodgerblue") +
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  theme(legend.position = "none")
ggsave("time_of_day_max_rt.png", dpi = 300, width = 12, height = 9)
## IMPRESSIONS BY HOUR | |
# Mean impressions and number of posts for each posting hour.
imp_by_hour <- ddply(
  data, .(hour), summarise,
  average = mean(impressions),
  count = length(Tweet.id)
)
# Flag the five hours with the highest mean and run the pairwise t tests.
top_imp_hours <- top_n_for_hour(imp_by_hour, 5)$hour
imp_by_hour <- designate_top(imp_by_hour, top_imp_hours)
imp_t <- time_of_day_t(data, "impressions", top_imp_hours)
ggplot(imp_by_hour) +
  geom_bar(aes(hour, average, fill = top), stat = "identity") +
  xlab("Time of Day") +
  ylab("Impressions") +
  ggtitle("Best Time of Day to Maximize Impressions") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  scale_colour_manual(values = c("red", "dodgerblue4")) +
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  scale_fill_gradient2(low = "dodgerblue", high = "red", mid = "dodgerblue") +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  theme(legend.position = "none")
ggsave("time_of_day_max_imp.png", dpi = 300, width = 12, height = 9)
## ENGAGEMENT RATE BY HOUR | |
# Mean engagement rate for each posting hour.
er_by_hour <- ddply(
  data, .(hour), summarise,
  average = mean(engagement.rate)
)
# Flag the eleven hours with the highest mean and run the pairwise t tests.
top_er_hours <- top_n_for_hour(er_by_hour, 11)$hour
er_by_hour <- designate_top(er_by_hour, top_er_hours)
er_t <- time_of_day_t(data, "engagement.rate", top_er_hours)
ggplot(er_by_hour) +
  geom_bar(aes(hour, average, fill = top), stat = "identity") +
  xlab("Time of Day") +
  ylab("Engagement Rate") +
  ggtitle("Best Time of Day to Maximize Engagement Rate") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low = "dodgerblue", high = "red", mid = "dodgerblue") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = percent_format())
ggsave("er_by_tod.png", dpi = 300, width = 12, height = 9)
## URL CLICKS BY HOUR | |
# Per-tweet click-through rate: URL clicks divided by impressions.
data$url.ctr <- data$url.clicks / data$impressions
# Pooled click-through rate per posting hour (total clicks / total
# impressions), stored under both `average` and `url.ctr`.
url_by_hour <- ddply(
  data, .(hour), summarise,
  average = sum(url.clicks) / sum(impressions),
  url.ctr = sum(url.clicks) / sum(impressions)
)
# Flag the four hours with the highest rate and run the pairwise t tests on
# the per-tweet rate.
top_url_hours <- top_n_for_hour(url_by_hour, 4)$hour
url_by_hour <- designate_top(url_by_hour, top_url_hours)
url_t <- time_of_day_t(data, "url.ctr", top_url_hours)
ggplot(url_by_hour) +
  geom_bar(aes(hour, average, fill = top), stat = "identity") +
  xlab("Time of Day") +
  ylab("Click Rate") +
  ggtitle("Best Time of Day to Maximize Click Rate") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low = "dodgerblue", high = "red", mid = "dodgerblue") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = percent_format())
ggsave("url_click_by_tod.png", dpi = 300, width = 12, height = 9)
## FOLLOWS BY HOUR | |
# Mean follows per tweet for each posting hour.
follows_by_hour <- ddply(
  data, .(hour), summarise,
  average = mean(follows)
)
# Flag the four hours with the highest mean and run the pairwise t tests.
top_follows_hours <- top_n_for_hour(follows_by_hour, 4)$hour
follows_by_hour <- designate_top(follows_by_hour, top_follows_hours)
follows_t <- time_of_day_t(data, "follows", top_follows_hours)
ggplot(follows_by_hour) +
  geom_bar(aes(hour, average, fill = top), stat = "identity") +
  xlab("Time of Day") +
  ylab("Follows") +
  ggtitle("Best Time of Day to New Follows") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low = "dodgerblue", high = "red", mid = "dodgerblue") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = percent_format())
ggsave("follow_rate_by_tod.png", dpi = 300, width = 12, height = 9)
### DAY OF WEEK ANALYSIS | |
## DAY OF WEEK FUNCTIONS | |
# Day name of each tweet, taken from the date part of the converted timestamp.
data[["day"]] <- weekdays(as.Date(data[["time"]]))
# Numeric day of week from POSIXlt (0 = Sunday ... 6 = Saturday).
data[["weekday"]] <- as.POSIXlt(data[["time"]])$wday
# Pairwise t tests of `field` between days of the week.
#
# df:       data frame with a numeric `weekday` column (0..6, as produced by
#           POSIXlt's wday) and a numeric column `field`.
# field:    name (string) of the column to compare.
# top_days: weekday numbers of interest; only pairs where both days are in
#           this set are returned.
# Returns the melted p-value table (columns day, variable, value) limited to
# pairs with p < 0.05 whose two days are both in `top_days`.
weekday_t <- function(df, field, top_days) {
  stopifnot(field %in% names(df))
  day_codes <- 0:6
  p_values <- data.frame(matrix(NA, nrow = 7, ncol = 7))
  # Label rows/columns with the weekday codes themselves. seq(0:6) yields
  # 1..7, which made the `%in% top_days` filters below off by one.
  colnames(p_values) <- day_codes
  rownames(p_values) <- day_codes
  for (i in seq_along(day_codes)) {
    # which() drops rows whose weekday is NA instead of producing NA entries.
    current <- df[[field]][which(df$weekday == day_codes[i])]
    for (j in seq_along(day_codes)) {
      # Select on the numeric `weekday` column; the `day` column holds day
      # names and never equals a number, so no test was ever run.
      candidate <- df[[field]][which(df$weekday == day_codes[j])]
      if (length(current) > 1 && length(candidate) > 1) {
        # t.test() stops with an error when the samples are essentially
        # constant; record such pairs as untestable (NA) rather than
        # aborting the whole analysis.
        p_values[i, j] <- tryCatch(
          t.test(current, candidate)$p.value,
          error = function(e) NA_real_
        )
      }
    }
  }
  # The factor column becomes melt()'s id variable; the seven day columns
  # are stacked into `variable` / `value`.
  p_values$day <- factor(rownames(p_values))
  long <- melt(p_values)
  # NA p-values fail the comparison and are dropped by subset().
  subset(long, value < 0.05 & day %in% top_days & variable %in% top_days)
}
# Return the rows of `df` with the largest `average`, best first.
#
# df: data frame with a numeric `average` column (one row per weekday).
# n:  number of rows wanted (non-negative).
# Returns a data frame with at most `n` rows; fewer when `df` is shorter.
top_n_for_day <- function(df, n) {
  ranked <- df[order(df$average, decreasing = TRUE), , drop = FALSE]
  # seq_len() rather than 1:n: `1:0` would select row 1, and an `n` larger
  # than nrow(df) (e.g. 7 when some weekday has no tweets) would pad the
  # result with all-NA rows.
  ranked[seq_len(min(n, nrow(ranked))), , drop = FALSE]
}
## PLOT HIGH LEVEL DATA | |
# Scatter plot of each tweet's engagement rate against its weekday number,
# jittered so overlapping points stay visible.
ggplot(data) +
  # Bare column names: aes() is evaluated inside the plot data, so `data$col`
  # is redundant and ggplot2 warns that it is discouraged.
  geom_point(aes(weekday, engagement.rate), size = 5, alpha = 0.7,
             colour = "dodgerblue", position = "jitter") +
  xlab("") +
  ylab("Engagement Rate") +
  ggtitle("Engagement Rate by Day of Week") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  # No colour aesthetic is mapped, so the former scale_colour_manual() had
  # nothing to act on and was dropped.
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  scale_y_continuous(labels = percent_format())
# Box plot of impressions per tweet, one box per weekday number.
ggplot(data) +
  # The outlier colour is a fixed setting, not a data mapping, so it belongs
  # outside aes(); only the outlier.colour argument below is kept.
  geom_boxplot(aes(weekday, impressions, group = weekday),
               fill = "orange", colour = "gray",
               outlier.colour = "gray50", outlier.size = 3) +
  xlab("") +
  ylab("Impressions") +
  ggtitle("Impressions by Day of Week") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  scale_y_continuous(labels = comma_format())
## URL CLICKS BY DAY | |
# Per-tweet click-through rate: URL clicks divided by impressions.
data$url.ctr <- data$url.clicks / data$impressions
# Pooled click-through rate per weekday (total clicks / total impressions),
# stored under both `average` and `url.ctr`, plus the number of tweets.
url_by_day <- ddply(
  data, .(weekday), summarise,
  average = sum(url.clicks) / sum(impressions),
  url.ctr = sum(url.clicks) / sum(impressions),
  count = length(url.clicks)
)
# Rank all seven weekdays and run the pairwise t tests on the per-tweet rate.
top_url_day <- top_n_for_day(url_by_day, 7)$weekday
url_by_day <- designate_top(url_by_day, top_url_day)
day_url_t <- weekday_t(data, "url.ctr", top_url_day)
ggplot(url_by_day) +
  geom_bar(aes(weekday, average, fill = top), stat = "identity") +
  # The x axis is the weekday number, not the hour (label was copy-pasted
  # from the hourly plots).
  xlab("Day of Week") +
  ylab("Click Rate") +
  ggtitle("Best Day of Week to Maximize Click Rate") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low = "dodgerblue", high = "red", mid = "dodgerblue") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = percent_format())
## ENGAGEMENT RATE BY DAY | |
# Mean engagement rate for each weekday.
er_by_day <- ddply(
  data, .(weekday), summarise,
  average = mean(engagement.rate)
)
# Rank all seven weekdays and run the pairwise t tests.
top_er_day <- top_n_for_day(er_by_day, 7)$weekday
er_by_day <- designate_top(er_by_day, top_er_day)
day_er_t <- weekday_t(data, "engagement.rate", top_er_day)
ggplot(er_by_day) +
  geom_bar(aes(weekday, average, fill = top), stat = "identity") +
  # The x axis is the weekday number, not the hour (label was copy-pasted
  # from the hourly plots).
  xlab("Day of Week") +
  ylab("Engagement Rate") +
  ggtitle("Best Day of Week to Maximize Engagement Rate") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low = "dodgerblue", high = "red", mid = "dodgerblue") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = percent_format())
## IMPRESSIONS BY DAY | |
# Mean impressions for each weekday.
imp_by_day <- ddply(
  data, .(weekday), summarise,
  average = mean(impressions)
)
# Rank all seven weekdays and run the pairwise t tests.
top_imp_day <- top_n_for_day(imp_by_day, 7)$weekday
imp_by_day <- designate_top(imp_by_day, top_imp_day)
day_imp_t <- weekday_t(data, "impressions", top_imp_day)
ggplot(imp_by_day) +
  geom_bar(aes(weekday, average, fill = top), stat = "identity") +
  # Labels describe what is actually plotted: mean impressions per weekday.
  # The originals were copy-pasted from the engagement-rate plot.
  xlab("Day of Week") +
  ylab("Impressions") +
  ggtitle("Best Day of Week to Maximize Impressions") +
  theme(panel.grid.major.y = element_line(colour = "gray"),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(size = rel(2))) +
  theme(legend.text = element_text(size = 18),
        axis.text = element_text(size = 24)) +
  expand_limits(y = 0) +
  theme(panel.background = element_rect(fill = "white"),
        axis.title.y = element_text(size = 24)) +
  theme(legend.position = "bottom") +
  theme(strip.text.x = element_text(size = 14),
        strip.background = element_rect(fill = "white")) +
  theme(text = element_text(family = "News Gothic MT", face = "bold")) +
  annotate("text", x = Inf, y = -Inf, label = "tomtunguz.com",
           hjust = 1.1, vjust = -1.1, col = "gray", cex = 6,
           fontface = "bold", alpha = 0.8) +
  scale_fill_gradient2(low = "dodgerblue", high = "red", mid = "dodgerblue") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma_format())
### CORRELATIONS | |
# Pearson correlations between impressions and the engagement metrics.
cor(data$impressions, data$engagement.rate)
cor(data$retweets, data$impressions)
cor(data$replies, data$impressions)
# Character count of every tweet. Indexing Tweet.text with [1] measured only
# the first tweet and recycled that single length across all rows.
data$tweet.length <- nchar(as.character(data$Tweet.text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hey Tom,
thanks for this great script. Clemens and I took the time to make it a bit easier to understand, as we feel that this script can be very useful for people who might not have a lot of experience with scripting.
We've extracted a few variables to facilitate configuration, made the charts show your chosen time zone, and improved the documentation to make it easier to get started with the script.
The updated gist can be found here:
https://gist.github.com/manualwise/c92fda9d884f47f43794
Thanks again,
Manuel