Last active
August 29, 2015 14:18
-
-
Save jwinternheimer/6139c3a490da48640bc7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table); library(dplyr); library(tidyr) | |
library(ggplot2); library(scales); library(grid); library(RColorBrewer) | |
## Import and Tidy Data
# NOTE: clean.text() is defined further down in this script, so that
# definition has to be evaluated before this section is run.
updates <- read.csv("~/Downloads/pst_updates.csv", header = TRUE)
names(updates) <- c("update_id", "num_chars", "num_explanation", "num_hashtags",
                    "hour_sent", "num_followers", "follower_tier", "link",
                    "link_no_content", "photo", "text", "interactions",
                    "retweets", "favorites", "mentions", "clicks",
                    "avg_interaction_length")

# Map a vector of character counts onto 20-character tiers.
#   n: numeric vector of text lengths.
# Returns a factor with labels "T1 (0,20)" ... "T7 [120,140]". Lengths above
# 140 are labelled "Undefined" (the raw-text ladder used to fold them back
# into "T1 (0,20)", which mixed the longest tweets into the shortest tier).
# NA lengths stay NA.
length_tier <- function(n) {
  tier_labels <- c("T1 (0,20)", "T2 [20,40)", "T3 [40,60)", "T4 [60,80)",
                   "T5 [80,100)", "T6 [100,120)", "T7 [120,140]")
  # right = FALSE gives [a, b) intervals; include.lowest = TRUE closes the
  # last one so that exactly 140 characters still lands in T7.
  tier <- as.character(cut(n,
                           breaks = c(-Inf, 20, 40, 60, 80, 100, 120, 140),
                           labels = tier_labels,
                           right = FALSE, include.lowest = TRUE))
  tier[!is.na(n) & n > 140] <- "Undefined"
  factor(tier)
}

# Derived columns: cleaned text, raw/clean lengths, and total engagement.
updates <- updates %>%
  mutate(clean_text = clean.text(text)) %>%
  mutate(text_length = nchar(as.character(text)),
         clean_text_length = nchar(as.character(clean_text)),
         engagement = retweets + favorites + mentions)

# Hour of day is treated as a categorical predictor.
updates$hour_sent <- as.factor(updates$hour_sent)

updates <- updates %>%
  mutate(text_length_tier = length_tier(text_length),
         clean_text_length_tier = length_tier(clean_text_length))
#################################################
## Tweet Length Plots
#################################################
# Both density plots share the same 0-140 character x axis with a tick every
# 10 characters. fte_theme() is defined at the bottom of this script.

## Density of raw tweet lengths
tweet_length_density <- ggplot(data = updates, mapping = aes(x = text_length)) +
  geom_density(fill = "deepskyblue1", color = "deepskyblue1", alpha = 0.4) +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  labs(x = "Tweet Length", y = "Density", title = "Tweet Length Density") +
  fte_theme()

## Density of cleaned tweet lengths
clean_tweet_length_density <- ggplot(data = updates,
                                     mapping = aes(x = clean_text_length)) +
  geom_density(fill = "deepskyblue1", color = "deepskyblue1", alpha = 0.4) +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  labs(x = "Tweet Length", y = "Density", title = "Clean Tweet Length Density") +
  fte_theme()
## Tweet Length vs. Engagement Scatter Plot
# Plots `engagement` (retweets + favorites + mentions), so the axis label and
# title say "Engagement" rather than "Interactions", which is a separate
# column. Tweets whose cleaned text is empty are excluded.
tweet_length_plot <- ggplot(filter(updates, clean_text_length > 0),
                            aes(x = clean_text_length, y = engagement,
                                color = follower_tier)) +
  fte_theme() +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  ylim(0, 250) +
  # group = 1 fits one smoother across all follower tiers.
  geom_smooth(aes(group = 1), size = 3) +
  labs(x = "Tweet Length", y = "Engagement",
       title = "Tweet Engagement by Character Count")

## Filter Scatter Plot by Follower Tier
# Follower tiers kept for the "smaller accounts" views (under 2000 followers).
follower_tier_filter <- c("T01 [1,100)", "T02 [100,500)", "T03 [500,1000)",
                          "T04 [1000,2000)")

## Scatter Plot Colored by Filtered Follower Tier
# Uses the `interactions` column, so the "Interactions" labels are correct.
filtered_tweet_length <- ggplot(filter(updates, follower_tier %in% follower_tier_filter),
                                aes(x = clean_text_length, y = interactions,
                                    color = follower_tier)) +
  fte_theme() +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  scale_y_continuous(limits = c(0, 30), breaks = seq(0, 30, 10)) +
  geom_smooth(aes(group = 1), size = 3) +
  labs(x = "Tweet Length", y = "Interactions",
       title = "Tweet Interactions by Character Count")
## Scatter Plot Colored by Link
# y is `engagement`, so the axis is labelled "Engagement" to match the title
# (it previously read "Interactions", which is a different column).
# No group override here: geom_smooth() fits one curve per `link` value.
linked_tweet_length <- ggplot(updates,
                              aes(x = clean_text_length, y = engagement,
                                  color = link)) +
  fte_theme() +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 10)) +
  geom_smooth(size = 3) +
  labs(x = "Tweet Length", y = "Engagement",
       title = "Tweet Engagement by Character Count")

## Scatter Plot Colored by Image
# Same layout as above, split by the `photo` column instead of `link`.
photo_tweet_length <- ggplot(updates,
                             aes(x = clean_text_length, y = engagement,
                                 color = photo)) +
  fte_theme() +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_x_continuous(limits = c(0, 140), breaks = seq(0, 140, 10)) +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 10)) +
  geom_smooth(size = 3) +
  labs(x = "Tweet Length", y = "Engagement",
       title = "Tweet Engagement by Character Count")
## Boxplots
# Interactions per raw-text length tier; the y axis is limited to 0-50.

# All accounts
tweet_length_boxplots <- ggplot(data = updates,
                                mapping = aes(x = text_length_tier,
                                              y = interactions)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, 10)) +
  labs(x = "Tweet Length Tier", y = "Interaction",
       title = "Interaction by Tweet Length") +
  fte_theme()

# Only accounts whose follower tier is listed in follower_tier_filter
filtered_tweet_boxplots <- ggplot(data = filter(updates, follower_tier %in% follower_tier_filter),
                                  mapping = aes(x = text_length_tier,
                                                y = interactions)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, 10)) +
  labs(x = "Tweet Length Tier", y = "Interaction",
       title = "Interaction by Tweet Length") +
  fte_theme()
## Hashtag plots
# NOTE(review): `feb_updates` is not created anywhere in this script; it
# presumably has the same columns as `updates` - confirm where it comes from
# before running this section.

## Plot Number of Hashtags vs Interaction Time
hashtags_plot <- ggplot(data = feb_updates,
                        mapping = aes(x = as.factor(num_hashtags),
                                      y = avg_interaction_length)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, 10)) +
  labs(x = "Number of Hashtags", y = "Average Interaction Time",
       title = "Interaction Time by Number of Hashtags") +
  fte_theme()

## Plot Number of Hashtags vs Total Interaction
hashtags_interaction_plot <- ggplot(data = feb_updates,
                                    mapping = aes(x = as.factor(num_hashtags),
                                                  y = interactions)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, 10)) +
  labs(x = "Number of Hashtags", y = "Interactions",
       title = "Interaction by Number of Hashtags") +
  fte_theme()

## Tweet length vs interactions, colored by hashtag count
hashtags_interaction_scatter <- ggplot(data = feb_updates,
                                       mapping = aes(x = clean_text_length,
                                                     y = interactions,
                                                     color = num_hashtags)) +
  geom_point(size = 1, alpha = 0.3, position = "jitter") +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 10)) +
  labs(x = "Tweet Length", y = "Interactions",
       title = "Interaction by Tweet Length and Hashtags") +
  fte_theme()
#################################################
## Linear Regression Models
#################################################
# The photo indicator is named `photo` when the columns are renamed at import;
# there is no `with_photo` column, so every formula below uses `photo`.
# NOTE(review): `feb_updates` is not created anywhere in this script; it
# presumably has the same columns as `updates` - confirm before running the
# models that use it.

# Interactions on raw text length (numeric) and on raw text length tier.
mod1 <- lm(interactions ~ num_explanation + num_hashtags + num_followers + link +
             link_no_content + photo + text_length,
           data = updates)

mod2 <- lm(interactions ~ num_explanation + num_hashtags + num_followers + link +
             photo + text_length_tier,
           data = updates)

# From here on the tier filter also includes accounts with 2000-4000 followers.
follower_tier_filter <- c("T01 [1,100)", "T02 [100,500)", "T03 [500,1000)",
                          "T04 [1000,2000)", "T05 [2000,4000)")

mod3 <- lm(interactions ~ num_explanation + num_hashtags + num_followers + link +
             photo + text_length_tier,
           data = filter(updates, follower_tier %in% follower_tier_filter))

# Engagement on clean-text tier; `hour_sent` was listed twice in this formula.
mod4 <- lm(engagement ~ num_explanation + num_hashtags + num_followers + hour_sent +
             link + photo + clean_text_length_tier,
           data = filter(updates, clean_text_length > 0))

mod6 <- lm(engagement ~ num_explanation + num_hashtags + num_followers + link +
             photo + hour_sent + clean_text_length_tier,
           data = filter(updates,
                         follower_tier %in% follower_tier_filter & clean_text_length > 0))

# This fit was originally assigned to `mod7` and then immediately overwritten
# by the log model below; it now gets the unused name `mod5` so it survives.
mod5 <- lm(interactions ~ num_explanation + num_hashtags + num_followers + link +
             photo + clean_text_length_tier,
           data = filter(feb_updates,
                         follower_tier %in% follower_tier_filter & clean_text_length > 0))

## Log and Square Root Transformations
# log1p() (= log(1 + x)) instead of log(): a tweet with zero interactions
# would otherwise produce -Inf, which lm() rejects as a response value.
updates$logInteractions <- log1p(updates$interactions)
mod7 <- lm(logInteractions ~ num_explanation + num_hashtags + num_followers + link +
             photo + hour_sent + clean_text_length_tier,
           data = filter(updates, follower_tier %in% follower_tier_filter))

updates$rootInteractions <- sqrt(updates$interactions)
mod8 <- lm(rootInteractions ~ num_explanation + num_hashtags + num_followers + link +
             photo + hour_sent + clean_text_length_tier,
           data = filter(updates, follower_tier %in% follower_tier_filter))

## Using Retweets and Favorites as Responses
mod9 <- lm(retweets ~ num_explanation + num_hashtags + num_followers + link +
             photo + clean_text_length_tier,
           data = filter(feb_updates, follower_tier %in% follower_tier_filter))

mod10 <- lm(favorites ~ num_explanation + num_hashtags + num_followers + link +
              photo + clean_text_length_tier,
            data = filter(feb_updates, follower_tier %in% follower_tier_filter))

## Regression for Users with < 1000 Followers
# Restrict to the three tiers below 1000 followers; the general filter above
# reaches up to 4000 and would not match this section's heading.
small_account_tiers <- c("T01 [1,100)", "T02 [100,500)", "T03 [500,1000)")
mod11 <- lm(interactions ~ num_explanation + num_hashtags + num_followers + link +
              photo + clean_text_length_tier,
            data = filter(feb_updates, follower_tier %in% small_account_tiers))
#################################################
## Machine Learning Regression
#################################################
library(caret)

# NOTE(review): `feb_updates` is not created anywhere in this script; it
# presumably has the same columns as `updates` - confirm before running.

# 70/30 train/test split on the response. list = FALSE makes
# createDataPartition() return a matrix of row indices rather than a list
# (spelled out as FALSE because `F` is an ordinary, reassignable variable).
inTrain <- createDataPartition(y = feb_updates$interactions, p = 0.7, list = FALSE)
trainUpdates <- feb_updates[inTrain, ]
testUpdates <- feb_updates[-inTrain, ]

# Linear model fitted through caret on the training rows only.
modFit1 <- train(interactions ~ num_explanation + num_hashtags + num_followers +
                   link + photo + clean_text_length_tier,
                 data = trainUpdates, method = "lm")
#################################################
## Clean Text Function
#################################################
# Strip Twitter noise from tweet text so that only the words remain.
#
#   some_txt: character vector (or factor) of tweet texts.
#
# Returns a character vector of the same length with retweet/via attributions,
# @mentions, "&amp;" entities, punctuation, digits and URLs removed, runs of
# spaces/tabs collapsed to one space, and leading/trailing whitespace trimmed.
clean.text <- function(some_txt) {
  # "RT @user" / "via @user" attributions, then any remaining @mentions.
  cleaned <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  cleaned <- gsub("@\\w+", "", cleaned)
  # Drop the HTML ampersand entity while it is still recognisable; deleting
  # the bare letters "amp" afterwards also mangled words like "example".
  cleaned <- gsub("&amp;", "", cleaned, fixed = TRUE)
  cleaned <- gsub("[[:punct:]]", "", cleaned)
  cleaned <- gsub("[[:digit:]]", "", cleaned)
  # With punctuation gone a URL is a single "http..." word, removed whole.
  cleaned <- gsub("http\\w+", "", cleaned)
  # Collapse whitespace runs to ONE space; replacing them with "" glued the
  # words on either side of a removed mention or URL together.
  cleaned <- gsub("[ \t]{2,}", " ", cleaned)
  gsub("^\\s+|\\s+$", "", cleaned)
}
#################################################
## ggplot Theme
#################################################
# Grey, low-ink ggplot2 theme used by every plot in this script.
# Takes no arguments and returns a theme object to add to a plot with `+`.
fte_theme <- function() {
  # Nine-step grey ramp from RColorBrewer; lighter shades for surfaces,
  # darker shades for text.
  greys <- brewer.pal("Greys", n = 9)
  bg_col <- greys[2]
  grid_col <- greys[3]
  axis_text_col <- greys[6]
  axis_title_col <- greys[7]
  title_col <- greys[9]

  theme_bw(base_size = 9) +
    theme(
      # Whole chart region (panel, plot area, border) in the same light grey
      panel.background = element_rect(fill = bg_col, color = bg_col),
      plot.background = element_rect(fill = bg_col, color = bg_col),
      panel.border = element_rect(color = bg_col),
      # Grid: thin major lines, no minor lines, no ticks, no vertical lines
      panel.grid.major = element_line(color = grid_col, size = .25),
      panel.grid.minor = element_blank(),
      axis.ticks = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank(),
      # Legend styling (the legend itself is left visible)
      legend.background = element_rect(fill = bg_col),
      legend.text = element_text(size = 15, color = axis_title_col),
      # Title, axis tick labels and axis titles
      plot.title = element_text(color = title_col, size = 20, vjust = 1.25),
      axis.text.x = element_text(size = 10, color = axis_text_col),
      axis.text.y = element_text(size = 10, color = axis_text_col),
      axis.title.x = element_text(size = 15, color = axis_title_col, vjust = 0),
      axis.title.y = element_text(size = 15, color = axis_title_col, vjust = 1.25),
      # Plot margins
      plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm")
    )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment