Created
July 13, 2015 07:34
-
-
Save jwinternheimer/3ccdf7c07fc0f1f524a3 to your computer and use it in GitHub Desktop.
Update Text Analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table); library(dplyr); library(tidyr) | |
library(ggplot2); library(scales); library(grid); library(RColorBrewer) | |
## Import Dataset
# Read the raw export; the first row of the file is treated as a header.
updates <- read.csv("~/Downloads/updates.csv", header = TRUE)

# Replace the column names positionally with the 18 names the rest of the
# script relies on.
names(updates) <- c(
  "update_id", "profile_id", "num_chars", "num_explanation", "num_hashtags",
  "hour_sent", "num_followers", "follower_tier", "link", "link_no_content",
  "photo", "text", "interactions", "retweets", "favorites", "mentions",
  "clicks", "avg_interaction_length"
)
## Clean the text and count characters
# NOTE: clean.text() is defined further down in this file (Clean Text Function
# section); it has to exist in the session before this step is run.
updates <- updates %>%
  mutate(
    # Tweet text with mentions, links, punctuation and digits stripped
    clean_text = clean.text(text),
    # Character counts of the raw and the cleaned text
    text_length = nchar(as.character(text)),
    clean_text_length = nchar(as.character(clean_text)),
    # Total engagement, and the same figure relative to audience size
    engagement = retweets + favorites + mentions,
    engagement_per_follower = engagement / num_followers
  )
## Set hour variable as a factor
# Treat the sending hour as a categorical value rather than a number.
updates[["hour_sent"]] <- as.factor(updates[["hour_sent"]])
## Manually create text length tiers
# Tiers are 20 characters wide: below 20, [20,40), ... [100,120), then
# [120,140]. Texts longer than 140 characters are labelled "Undefined" (the
# same fallback used for the clean-text tiers) instead of being folded back
# into the shortest tier, which would distort that tier's averages.
updates <- updates %>%
  mutate(
    text_length_tier = ifelse(
      text_length > 140,
      "Undefined",
      # findInterval() counts how many lower bounds are <= the length, so
      # adding 1 gives the position of the matching label.
      c("T1 (0,20)", "T2 [20,40)", "T3 [40,60)", "T4 [60,80)",
        "T5 [80,100)", "T6 [100,120)", "T7 [120,140]")[
        findInterval(text_length, seq(20, 120, by = 20)) + 1
      ]
    )
  )
## Manually create tiers for clean text length
# Same 20-character buckets as the raw text tiers; cleaned texts longer than
# 140 characters are labelled "Undefined".
updates <- updates %>%
  mutate(
    clean_text_length_tier = ifelse(
      clean_text_length > 140,
      "Undefined",
      # Number of lower bounds that are <= the length, plus 1, indexes the label.
      c("T1 (0,20)", "T2 [20,40)", "T3 [40,60)", "T4 [60,80)",
        "T5 [80,100)", "T6 [100,120)", "T7 [120,140]")[
        findInterval(clean_text_length, seq(20, 120, by = 20)) + 1
      ]
    )
  )
## Add Clean Text Length Tier as Factor
# Both tier columns become factors so they behave as categories downstream.
updates[["text_length_tier"]] <- as.factor(updates[["text_length_tier"]])
updates[["clean_text_length_tier"]] <-
  as.factor(updates[["clean_text_length_tier"]])

## Save Data
# Persist the prepared data frame under its own name.
save(list = "updates", file = "~/Google Drive/R_data/updates.Rda")
#################################################
## Engagement Visualization
#################################################
# NOTE: buffer_theme() is defined at the bottom of this file and has to exist
# before the density plot in this section is built.

## Engagement CDF
# Empirical CDF of total engagement, zoomed to 0-200 on the x axis.
# NOTE: `engagement_cdf` is assigned again in the Links section, which
# replaces this plot object.
engagement_cdf <- ggplot(updates, aes(x = engagement)) +
  stat_ecdf(size = 1, color = "#547c9f") +
  theme_minimal() +
  coord_cartesian(xlim = c(0, 200)) +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(x = "Engagement", y = "Percent of Tweets", title = "Engagement CDF")

## Engagement CDF by Follower Tier
# One curve per follower tier; the "T00 (-inf,1)" tier is excluded.
engagement_cdf_followers <- ggplot(
  filter(updates, follower_tier != "T00 (-inf,1)"),
  aes(x = engagement, color = follower_tier)
) +
  stat_ecdf(size = 1) +
  theme_minimal() +
  coord_cartesian(xlim = c(0, 200)) +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(x = "Engagement", y = "Percent of Tweets", title = "Engagement CDF")

## Engagement Per Follower CDF
EpF_cdf <- ggplot(updates, aes(x = engagement_per_follower)) +
  stat_ecdf(size = 1, color = "#547c9f") +
  coord_cartesian(xlim = c(0, 0.015)) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(
    x = "Engagement Per Follower",
    y = "Percent of Tweets",
    title = "Engagement Per Follower CDF"
  )

## Engagement Per Follower Density
# The x scale limits drop values outside 0-0.005 before the density is
# estimated; the trailing theme() call lowers the base text size set by
# buffer_theme().
EpF_density <- ggplot(updates, aes(x = engagement_per_follower)) +
  geom_density(alpha = 0.8, fill = "#547c9f", color = "#547c9f") +
  buffer_theme() +
  scale_x_continuous(limits = c(0, 0.005), expand = c(0, 0.0001)) +
  scale_y_continuous(breaks = seq(0, 1500, 500), expand = c(0, 0)) +
  labs(
    x = "Engagement Per Follower",
    y = "Number of Tweets",
    title = "Engagement Per Follower Density"
  ) +
  theme(text = element_text(size = 15))
#################################################
## Text Length
#################################################
## CDF of Tweet Lengths
# One curve per value of `link`, on the cleaned text length.
tweet_length_cdf <- ggplot(updates, aes(x = clean_text_length, color = link)) +
  stat_ecdf(size = 1) +
  theme_minimal() +
  coord_cartesian(xlim = c(0, 150)) +
  scale_x_continuous(breaks = seq(0, 150, 25)) +
  labs(
    x = "Clean Text Length",
    y = "Percent of Tweets",
    color = "Includes Link?"
  )

## Density Plot of Tweet Lengths
tweet_length_density <- ggplot(updates, aes(x = text_length)) +
  geom_density(color = "#547c9f", alpha = 0.9, fill = "#547c9f") +
  scale_x_continuous(
    limits = c(0, 140),
    breaks = seq(0, 140, 10),
    expand = c(0, 2)
  ) +
  scale_y_continuous(limits = c(0, 0.02), expand = c(0, 0)) +
  buffer_theme() +
  labs(
    x = "Tweet Length",
    y = "Percent of Tweets",
    title = "Tweet Length Density",
    fill = "Includes Link?"
  )

## Clean Text Length Density
clean_tweet_length_density <- ggplot(updates, aes(x = clean_text_length)) +
  geom_density(alpha = 0.9, color = "#547c9f", fill = "#547c9f") +
  buffer_theme() +
  scale_x_continuous(
    limits = c(0, 140),
    breaks = seq(0, 140, 10),
    expand = c(0, 2)
  ) +
  labs(
    x = "Tweet Length",
    y = "Percent of Tweets",
    title = "Clean Tweet Length Density",
    fill = "Includes Link?"
  ) +
  scale_y_continuous(expand = c(0, 0))

## Tweet Length vs. Engagement Scatter Plot
# Only tweets with some text from accounts with at least one follower.
# NOTE(review): the next statement assigns to `tweet_length_scatter` again, so
# this plot object is replaced as soon as the by-link version is built.
tweet_length_scatter <- ggplot(
  filter(updates, text_length > 0 & num_followers > 0),
  aes(x = text_length, y = engagement_per_follower)
) +
  geom_point(size = 1, alpha = 0.3, position = "jitter", colour = "grey60") +
  scale_x_continuous(
    limits = c(0, 140),
    breaks = seq(0, 140, 20),
    expand = c(0, 2)
  ) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.015)) +
  # A single smoother across all points, with its confidence band
  geom_smooth(aes(group = 1), size = 1, se = TRUE, color = "#547c9f") +
  labs(x = "Tweet Length", y = "", title = "Tweet Engagement by Character Count") +
  buffer_theme()

## Tweet Length vs. Engagement Scatter Plot by Link
# Same scatter, coloured (and smoothed) separately per value of `link`.
tweet_length_scatter <- ggplot(
  filter(updates, text_length > 0 & num_followers > 0),
  aes(x = text_length, y = engagement_per_follower, color = link)
) +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 20)) +
  ylim(0, 0.015) +
  #scale_y_continuous(limits=c(0,0.015),breaks=seq(0,0.015,0.005))
  stat_smooth(size = 2, se = TRUE) +
  labs(
    x = "Tweet Length",
    y = "Engagement",
    title = "Tweet Engagement by Character Count",
    color = "Includes Link?"
  ) +
  theme_minimal()
## Group Data by Text Length Tier
# Per-tier tweet counts and average engagement metrics, restricted to tweets
# from accounts with followers and a defined engagement-per-follower value.
by_text_length_tier <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(text_length_tier) %>%
  summarise(
    tweets = n(),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )

## Bar Plot of Average Engagement by Text Length Tier
engagement_by_text_length_tier <- ggplot(
  by_text_length_tier,
  aes(x = text_length_tier, y = avg_engagement_per_follower)
) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Text Length Tier",
    y = "Average Engagement Per Follower",
    title = "Average Engagement Per Follower"
  ) +
  buffer_theme() +
  scale_y_continuous(expand = c(0, 0))
#################################################
## Links
#################################################
## Group Data by Inclusion of Link
# NOTE(review): 750776 is a hard-coded denominator for `percent`, presumably
# the number of tweets when this was written -- confirm it still matches the
# data before trusting the shares.
by_link <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(link) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )

## Group by Link and Follower Tier
# Same summary, broken down further by follower tier.
by_link_tier <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(link, follower_tier) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )
## Plot Pie Chart of Inclusion of Link
# A single stacked bar bent into polar coordinates; not assigned to a variable.
ggplot(by_link, aes(x = factor(1), y = percent, fill = link)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  theme_minimal() +
  #geom_text(aes(y = percent/2,2 + c(0, cumsum(percent)[-length(percent)]), label = percent), size=8) +
  theme(
    axis.ticks = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_blank()
  ) +
  labs(x = "", y = "", title = "Percent of Tweets With Links", fill = "Includes Link?")

## Plot CDF of Engagement Per Follower, by Link
link_engagement_cdf <- ggplot(
  updates,
  aes(x = engagement_per_follower, color = link)
) +
  stat_ecdf(size = 1) +
  coord_cartesian(xlim = c(0, 0.010)) +
  scale_x_continuous(expand = c(0, 0)) +
  buffer_theme() +
  scale_y_continuous(breaks = seq(0, 1, 0.2), expand = c(0, 0)) +
  labs(
    x = "Engagement Per Follower",
    y = "Percent of Tweets",
    title = "Engagement Per Follower CDF",
    color = "Includes Link?"
  )

## Plot CDF of Engagement, by Link
# NOTE(review): this reuses the name `engagement_cdf` from the Engagement
# Visualization section and replaces that earlier plot object.
engagement_cdf <- ggplot(updates, aes(x = engagement, color = link)) +
  stat_ecdf(size = 1) +
  coord_cartesian(xlim = c(0, 500)) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(
    x = "Total Engagement",
    y = "Percent of Tweets",
    title = "Total Engagement CDF",
    color = "Includes Link?"
  )

## Plot Average Engagement Per Follower by Inclusion of Link
# NOTE(review): `engagement_by_link` is assigned again by the follower-tier
# chart at the end of this section, which replaces this plot object.
engagement_by_link <- ggplot(
  by_link,
  aes(x = link, y = avg_engagement_per_follower)
) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Includes Link",
    y = "",
    title = "Average Engagement Per Follower",
    fill = "Includes Link?"
  ) +
  buffer_theme() +
  scale_y_continuous(limits = c(0, 0.0025), expand = c(0, 0))

## Plot Average Total Engagement by Inclusion of Link
# Not assigned to a variable.
ggplot(by_link, aes(x = link, y = avg_engagement)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(limits = c(0, 400), expand = c(0, 0)) +
  labs(
    x = "Includes Link",
    y = "",
    title = "Average Total Engagement",
    fill = "Includes Link?"
  ) +
  buffer_theme()

## Plot Average Engagement by Inclusion of Link, by Follower Tier
# Dodged bars per follower tier; the "T12 [200000,inf)" tier is left out.
# The x axis carries follower tiers, so it is labelled accordingly.
engagement_by_link <- ggplot(
  filter(by_link_tier, follower_tier != "T12 [200000,inf)"),
  aes(x = follower_tier, y = avg_engagement, fill = link)
) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    x = "Follower Tier",
    y = "Average Engagement",
    title = "Average Engagement by Inclusion of Link",
    fill = "Includes Link?"
  ) +
  theme_minimal() +
  scale_y_continuous(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#################################################
## Images
#################################################
## Group Data by Inclusion of Image
# NOTE(review): 750776 is a hard-coded denominator for `percent`, presumably
# the number of tweets when this was written -- confirm it still matches the
# data before trusting the shares.
by_image <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(photo) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )

## Group by Image and Follower Tier
# Same summary, broken down further by follower tier.
by_image_tier <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(photo, follower_tier) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )
## Plot Pie Chart of Inclusion of Image
# A single stacked bar bent into polar coordinates; not assigned to a variable.
ggplot(by_image, aes(x = factor(1), y = percent, fill = photo)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  theme_minimal() +
  theme(
    axis.ticks = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_blank()
  ) +
  labs(x = "", y = "", title = "Percent of Tweets With Image", fill = "Includes Image?")

## Plot CDF of Engagement Per Follower, by Image
image_engagement_cdf <- ggplot(
  updates,
  aes(x = engagement_per_follower, color = photo)
) +
  stat_ecdf(size = 1) +
  coord_cartesian(xlim = c(0, 0.0075)) +
  theme_minimal() +
  theme(text = element_text(size = 25)) +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(
    x = "Engagement Per Follower",
    y = "Percent of Tweets",
    title = "Engagement Per Follower CDF",
    color = "Includes Image?"
  )

## Plot CDF of Total Engagement, by Image
total_image_engagement_cdf <- ggplot(
  updates,
  aes(x = engagement, color = photo)
) +
  stat_ecdf(size = 1) +
  coord_cartesian(xlim = c(0, 400)) +
  theme_minimal() +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  labs(
    x = "Total Engagement",
    y = "Percent of Tweets",
    title = "Total Engagement CDF",
    color = "Includes Image?"
  )

## Plot Average Engagement Per Follower by Inclusion of Image
avg_engagement_image <- ggplot(
  by_image,
  aes(x = photo, y = avg_engagement_per_follower)
) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Includes Image",
    y = "",
    title = "Average Engagement Per Follower",
    fill = "Includes Image?"
  ) +
  buffer_theme() +
  scale_y_continuous(limits = c(0, 0.0025), expand = c(0, 0))

## Plot Average Total Engagement by Inclusion of Image
# Not assigned to a variable.
ggplot(by_image, aes(x = photo, y = avg_engagement)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(limits = c(0, 200), expand = c(0, 0)) +
  labs(
    x = "Includes Image",
    y = "",
    title = "Average Total Engagement",
    fill = "Includes Image?"
  ) +
  buffer_theme()
#################################################
## Hashtags
#################################################
## Histogram of Hashtags per Tweet
# One bin per hashtag count, zoomed to 0-10 on the x axis.
hashtag_hist <- ggplot(updates, aes(x = num_hashtags)) +
  geom_histogram(color = "white", fill = "#547c9f", binwidth = 1) +
  coord_cartesian(xlim = c(0, 10)) +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Hashtags", y = "Number of Tweets", title = "Distribution of Hashtags") +
  buffer_theme()

## Group Data by Number of Hashtags
# NOTE(review): 750776 is a hard-coded denominator for `percent`, presumably
# the number of tweets when this was written -- confirm it still matches the
# data before trusting the shares.
by_hash <- updates %>%
  filter(num_followers > 0 & !is.na(engagement_per_follower)) %>%
  group_by(num_hashtags) %>%
  summarise(
    tweets = n(),
    percent = round(n() / 750776, 3),
    avg_interactions = mean(interactions),
    avg_retweets = mean(retweets),
    avg_favorites = mean(favorites),
    avg_mentions = mean(mentions),
    avg_clicks = mean(clicks),
    avg_engagement = mean(engagement),
    avg_engagement_per_follower = mean(engagement_per_follower, na.rm = TRUE)
  )

## Plot Average Engagement Per Follower by Number of Hashtags
# Only hashtag counts up to 10 are shown.
# NOTE(review): the next statement assigns to `avg_engagement_hash` again, so
# this plot object is replaced by the total-engagement version.
avg_engagement_hash <- ggplot(
  filter(by_hash, num_hashtags < 11),
  aes(x = num_hashtags, y = avg_engagement_per_follower)
) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Hashtags",
    y = "",
    title = "Average Engagement Per Follower",
    fill = "Number of Hashtags"
  ) +
  buffer_theme() +
  scale_y_continuous(limits = c(0, 0.0035), expand = c(0, 0)) +
  scale_x_continuous(breaks = seq(0, 10, 1))

## Plot Average Total Engagement by Number of Hashtags
avg_engagement_hash <- ggplot(
  filter(by_hash, num_hashtags < 11),
  aes(x = num_hashtags, y = avg_engagement)
) +
  geom_bar(stat = "identity", fill = "#547c9f") +
  labs(
    x = "Hashtags",
    y = "",
    title = "Average Total Engagement",
    fill = "Number of Hashtags"
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 10, 1))
#################################################
## Clean Text Function
#################################################
# Strip Twitter-specific noise from tweet text.
#
# Args:
#   some_txt: character vector of tweet texts (anything gsub() can coerce to
#             character, e.g. a factor column, also works).
# Returns:
#   A character vector of the same length with retweet/"via" attributions,
#   @mentions, the "&amp;" HTML entity, punctuation, digits and links removed,
#   runs of two or more spaces/tabs collapsed to a single space, and leading
#   and trailing whitespace trimmed.
clean.text <- function(some_txt) {
  # Retweet / via attributions such as "RT @user" or "via @user"
  some_txt <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  # Any remaining @mentions
  some_txt <- gsub("@\\w+", "", some_txt)
  # Drop the HTML-escaped ampersand while it is still recognisable; removing a
  # bare "amp" after punctuation stripping would also eat the middle of
  # ordinary words such as "example" or "camp".
  some_txt <- gsub("&amp;", "", some_txt, fixed = TRUE)
  some_txt <- gsub("[[:punct:]]", "", some_txt)
  some_txt <- gsub("[[:digit:]]", "", some_txt)
  # With punctuation gone a URL is one run of word characters starting "http"
  some_txt <- gsub("http\\w+", "", some_txt)
  # Collapse the gaps left by the removals to one space so that neighbouring
  # words are not glued together
  some_txt <- gsub("[ \t]{2,}", " ", some_txt)
  gsub("^\\s+|\\s+$", "", some_txt)
}
################################################# | |
## ggplot Theme | |
################################################# | |
## ggplot Theme
# Blue palette for the charts; it is not referenced inside buffer_theme().
buffer_palette <- c("#3c5a72", "#547c9f", "#6295c0", "#72b0e3")

# Build the shared ggplot2 theme used by the charts in this file.
#
# Takes no arguments and returns a theme object meant to be added to a plot
# with `+`. Relies on ggplot2, RColorBrewer (brewer.pal) and grid (unit)
# being attached.
buffer_theme <- function() {
  # Greys for grid lines, axis text, axis titles and the plot title, taken
  # from RColorBrewer's 9-step "Greys" palette
  greys <- brewer.pal(n = 9, name = "Greys")
  grid_color <- greys[3]
  axis_text_color <- greys[6]
  axis_title_color <- greys[7]
  title_color <- greys[9]

  # Start from theme_bw() and override individual elements
  theme_bw(base_size = 9) +
    theme(
      # No fill or outline on the panel and plot backgrounds; no panel border
      panel.background = element_rect(fill = NA, color = NA),
      plot.background = element_rect(fill = NA, color = NA),
      panel.border = element_blank(),
      # Grid: thin major lines on the y axis only, no minor lines, no ticks
      panel.grid.major = element_line(color = grid_color, size = .25),
      panel.grid.minor = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank(),
      axis.ticks = element_blank(),
      axis.line = element_line(color = grid_color),
      # Legend: no background fill, larger text
      legend.background = element_rect(fill = NA),
      legend.text = element_text(size = 15, color = axis_title_color),
      # Title, axis labels and tick labels
      plot.title = element_text(color = title_color, size = 20, vjust = 1.25),
      axis.text.x = element_text(size = 10, color = axis_text_color),
      axis.text.y = element_text(size = 10, color = axis_text_color),
      axis.title.x = element_text(size = 15, color = axis_title_color, vjust = 0),
      axis.title.y = element_text(size = 15, color = axis_title_color, vjust = 1.25),
      # Plot margins
      plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm"),
      # Base text size for elements that do not set their own size above
      text = element_text(size = 25)
    )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment