Last active
August 29, 2015 14:21
-
-
Save jwinternheimer/4c36ce1efe29ecd48b89 to your computer and use it in GitHub Desktop.
churn_exploration.R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table); library(dplyr); library(tidyr) | |
library(ggplot2); library(scales); library(grid); library(RColorBrewer) | |
############################################################### | |
## MOST COMMON ACTIONS DURING FIRST TWO WEEKS | |
############################################################### | |
## IMPORT AND TIDY DATA | |
healthy_user_actions <- read.table("~/Downloads/small_user_actions.csv",header=T,sep=",") | |
churned_user_actions <- read.table("~/Downloads/churned_users_actions.csv",header=T,sep=",") | |
names(healthy_user_actions) <- c("user_id","action","count") | |
names(churned_user_actions) <- c("user_id","action","count") | |
## GROUP BY ACTION, BIND DATASETS | |
healthy_by_action <- healthy_user_actions %>% | |
group_by(action) %>% | |
summarise(users=n_distinct(user_id),total_count = sum(count)) %>% | |
mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>% | |
arrange(desc(count_per_user)) | |
churned_by_action <- churned_user_actions %>% | |
group_by(action) %>% | |
summarise(users=n_distinct(user_id),total_count = sum(count)) %>% | |
mutate(count_per_user = total_count/users, user_type=as.factor("churned")) | |
total_by_action <- rbind(healthy_by_action,churned_by_action) | |
save(total_by_action, file="~/Google Drive/total_by_action.Rda") | |
## TARGET ACTIONS - PLOT TOTAL ACTION COUNTS PER USER | |
target_actions1 <- arrange(total_by_action,desc(total_count))[0:10,]$action | |
target_actions2 <- arrange(total_by_action,desc(total_count))[11:20,]$action | |
target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action | |
target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action | |
target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action | |
## HORIZONTAL BAR PLOT OF TOP ACTIONS (BY 10) | |
action_bar_plot1 <- ggplot(filter(total_by_action,action %in% target_actions1), | |
aes(x=action,y=count_per_user,fill=user_type)) + | |
geom_bar(stat="identity",position="dodge") + | |
coord_flip() + buffer_theme() + | |
labs(x="",y="Average Actions Per User") | |
action_bar_plot2 <- ggplot(filter(total_by_action,action %in% target_actions2), | |
aes(x=action,y=count_per_user,fill=user_type)) + | |
geom_bar(stat="identity",position="dodge") + | |
coord_flip() + buffer_theme() + | |
labs(x="",y="Average Actions Per User") | |
action_bar_plot3 <- ggplot(filter(total_by_action,action %in% target_actions3), | |
aes(x=action,y=count_per_user,fill=user_type)) + | |
geom_bar(stat="identity",position="dodge") + | |
coord_flip() + buffer_theme() + | |
labs(x="",y="Average Actions Per User") | |
action_bar_plot4 <- ggplot(filter(total_by_action,action %in% target_actions4), | |
aes(x=action,y=count_per_user,fill=user_type)) + | |
geom_bar(stat="identity",position="dodge") + | |
coord_flip() + buffer_theme() + | |
labs(x="",y="Average Actions Per User") | |
action_bar_plot5 <- ggplot(filter(total_by_action,action %in% target_actions5), | |
aes(x=action,y=count_per_user,fill=user_type)) + | |
geom_bar(stat="identity",position="dodge") + | |
coord_flip() + buffer_theme() + | |
labs(x="",y="Average Actions Per User") | |
############################################################### | |
## UPDATES BEHAVIOR | |
############################################################### | |
## IMPORT AND TIDY DATA | |
churned_update_actions <- read.table("~/Downloads/churned_users_updates.csv",sep=",",header=T) | |
healthy_update_actions <- read.table("~/Downloads/healthy_users_updates.csv",sep=",",header=T) | |
names(churned_update_actions) <- c("user_id","action","count") | |
names(healthy_update_actions) <- c("user_id","action","count") | |
churned_update_actions$user_type <- as.factor("churned") | |
healthy_update_actions$user_type <- as.factor("healthy") | |
total_update_actions <- rbind(churned_update_actions,healthy_update_actions) | |
## GROUP BY ACTION, BIND DATASETS | |
healthy_updates_by_action <- healthy_update_actions %>% | |
group_by(action) %>% | |
summarise(users=n_distinct(user_id),total_count = sum(count)) %>% | |
mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>% | |
arrange(desc(count_per_user)) | |
churned_updates_by_action <- churned_update_actions %>% | |
group_by(action) %>% | |
summarise(users=n_distinct(user_id),total_count = sum(count)) %>% | |
mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>% | |
arrange(desc(count_per_user)) | |
total_updates_by_action <- rbind(healthy_updates_by_action,churned_updates_by_action) | |
save(total_updates_by_action, file="~/Google Drive/total_updates_by_action.Rda") | |
## TARGET ACTIONS - FIND MOST COMMON ACTIONS | |
target_update_actions1 <- arrange(total_updates_by_action,desc(total_count))[0:1,]$action | |
target_update_actions2 <- arrange(total_updates_by_action,desc(total_count))[2,]$action | |
target_update_actions3 <- arrange(total_updates_by_action,desc(total_count))[3,]$action | |
target_update_actions4 <- arrange(total_updates_by_action,desc(total_count))[4,]$action | |
target_update_actions5 <- arrange(total_updates_by_action,desc(total_count))[5,]$action | |
target_actions2 <- arrange(total_updates_by_action,desc(total_count))[11:20,]$action | |
target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action | |
target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action | |
target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action | |
## FIND ACTIONS WITH BIGGEST DIFFERENCES IN COUNTS | |
## CDF OF ACTIONS COUNT | |
updates_cdf <- ggplot(filter(total_update_actions,action %in% target_update_actions1), | |
aes(x=count,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,250)) + | |
labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Dashboard") + | |
scale_y_continuous(breaks=seq(0,1.00,0.2)) | |
updates_cdf2 <- ggplot(filter(total_update_actions,action %in% target_update_actions2), | |
aes(x=count,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) + | |
labs(x="Number of Updates",y="Percent of Users",title="Suggestions Shared in First 2 Weeks") + | |
scale_y_continuous(breaks=seq(0,1.00,0.2)) | |
updates_cdf4 <- ggplot(filter(total_update_actions,action %in% target_update_actions4), | |
aes(x=count,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) + | |
labs(x="Number of Updates",y="Percent of Users",title="Feeds Shared in First 2 Weeks") + | |
scale_y_continuous(breaks=seq(0,1.00,0.2)) | |
updates_cdf5 <- ggplot(filter(total_update_actions,action %in% target_update_actions5), | |
aes(x=count,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) + | |
labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Composer in First 2 Weeks") + | |
scale_y_continuous(breaks=seq(0,1.00,0.2)) | |
############################################################### | |
## INTERACTIONS PER UPDATE | |
############################################################### | |
## IMPORT AND TIDY DATA | |
churned_interactions <- read.table("~/Downloads/churned_user_interaction.csv",sep=",",header=T) | |
healthy_interactions <- read.table("~/Downloads/healthy_user_interaction.csv",sep=",",header=T) | |
names(churned_interactions) <- c("user_id","service","interactions","updates") | |
names(healthy_interactions) <- c("user_id","service","interactions","updates") | |
churned_interactions$user_type <- as.factor("churned") | |
healthy_interactions$user_type <- as.factor("healthy") | |
total_interactions <- rbind(churned_interactions,healthy_interactions) | |
total_interactions <- total_interactions %>% | |
mutate(interaction_per_update = interactions/updates) | |
## CDFS OF INTERACTIONS | |
twitter_interaction_cdf <- ggplot(filter(total_interactions,service=="twitter"), | |
aes(x=interaction_per_update,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,50))+ | |
labs(x="Interactions per Tweet",y="Percent of Users") | |
facebook_interaction_cdf <- ggplot(filter(total_interactions,service=="facebook"), | |
aes(x=interaction_per_update,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100))+ | |
labs(x="Interactions per Facebook Post",y="Percent of Users") | |
linkedin_interaction_cdf <- ggplot(filter(total_interactions,service=="linkedin"), | |
aes(x=interaction_per_update,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+ | |
labs(x="Interactions per Linkedin Post",y="Percent of Users") | |
google_interaction_cdf <- ggplot(filter(total_interactions,service=="google"), | |
aes(x=interaction_per_update,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+ | |
labs(x="Interactions per Google+ Post",y="Percent of Users") | |
############################################################### | |
## EXTENSION UPDATES | |
############################################################### | |
## IMPORT AND TIDY DATA | |
healthy_extension <- read.csv("~/Downloads/extension_updates_healthy.csv",header=T) | |
churned_extension <- read.csv("~/Downloads/extension_updates_churned.csv",header=T) | |
names(healthy_extension) <- c("user_id","action","count") | |
names(churned_extension) <- c("user_id","action","count") | |
healthy_extension$user_type <- as.factor("healthy") | |
churned_extension$user_type <- as.factor("churned") | |
total_extension_use <- rbind(healthy_extension,churned_extension) | |
extension_by_user <- total_extension_use %>% | |
group_by(user_id, user_type) %>% | |
summarise(updates_shared = sum(count)) | |
## CDF OF EXTENSION USAGE | |
extension_cdf <- ggplot(extension_by_user,aes(x=updates_shared,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,200)) + | |
scale_y_continuous(breaks=seq(0,1.00,0.2)) + | |
labs(x="Updates Shared From Extension",y="Percent of Users") | |
############################################################### | |
## NUMBER OF PROFILES CONNECTED | |
############################################################### | |
## IMPORT AND TIDY DATA | |
churned_profiles <- read.csv("~/Downloads/profiles_churned.csv",header=T) | |
healthy_profiles <- read.csv("~/Downloads/profiles_healthy.csv",header=T) | |
names(churned_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles") | |
names(healthy_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles") | |
churned_profiles$user_type <- as.factor("churned") | |
healthy_profiles$user_type <- as.factor("healthy") | |
total_profiles <- rbind(churned_profiles,healthy_profiles) | |
total_profiles <- total_profiles %>% | |
mutate(total_profiles = facebook_profiles + linkedin_profiles + pinterest_profiles + twitter_profiles) | |
## CDF OF PROFILES | |
profiles_cdf <- ggplot(total_profiles,aes(x=total_profiles,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) + | |
scale_y_continuous(breaks=seq(0,1.00,0.1)) + | |
labs(x="Number of Profiles",y="Percent of Users") | |
twitter_cdf <- ggplot(total_profiles,aes(x=twitter_profiles,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) + | |
scale_y_continuous(breaks=seq(0,1.00,0.1)) + | |
labs(x="Number of Twitter Profiles",y="Percent of Users") | |
fb_cdf <- ggplot(total_profiles,aes(x=facebook_profiles,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) + | |
scale_y_continuous(breaks=seq(0,1.00,0.1)) + | |
labs(x="Number of Facebook Profiles",y="Percent of Users") | |
linkedin_cdf <- ggplot(total_profiles,aes(x=linkedin_profiles,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) + | |
scale_y_continuous(breaks=seq(0,1.00,0.1)) + | |
labs(x="Number of Linkedin Profiles",y="Percent of Users") | |
############################################################### | |
## TEAMMATES | |
############################################################### | |
## IMPORT AND TIDY DATA | |
healthy_teammates <- read.csv("~/Downloads/healthy_teammates.csv",header=T) | |
churned_teammates <- read.csv("~/Downloads/churned_teammates.csv",header=T) | |
names(healthy_teammates) <- c("user_id","teammates") | |
names(churned_teammates) <- c("user_id","teammates") | |
healthy_teammates$user_type <- as.factor('healthy') | |
churned_teammates$user_type <- as.factor('churned') | |
total_teammates <- rbind(churned_teammates,healthy_teammates) | |
## MOBILE UPDATES CDF | |
teammates_cdf <- ggplot(total_teammates,aes(x=teammates,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,20)) + | |
scale_y_continuous(breaks=seq(0,1.00,0.1)) + | |
labs(x="Number of Teammates",y="Percent of Users") | |
############################################################### | |
## ANALYTICS ACTIONS | |
############################################################### | |
## IMPORT AND TIDY DATA | |
healthy_analytics <- read.csv("~/Downloads/healthy_analytics.csv",header=T) | |
churned_analytics <- read.csv("~/Downloads/churned_analytics.csv",header=T) | |
names(healthy_analytics) <- c("user_id","action_count") | |
names(churned_analytics) <- c("user_id","action_count") | |
healthy_analytics$user_type <- as.factor('healthy') | |
churned_analytics$user_type <- as.factor('churned') | |
total_analytics <- rbind(churned_analytics,healthy_analytics) | |
## ANALYTICS ACTIONS CDF | |
analytics_cdf <- ggplot(total_analytics,aes(x=action_count,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) + | |
scale_y_continuous(breaks=seq(0,1.00,0.1)) + | |
labs(x="Number of Analytics Actions",y="Percent of Users") | |
############################################################### | |
## MOBILE UPDATES | |
############################################################### | |
## IMPORT AND TIDY DATA | |
healthy_mobile <- read.csv("~/Downloads/healthy_mobile.csv",header=T) | |
churned_mobile <- read.csv("~/Downloads/churned_mobile.csv",header=T) | |
names(healthy_mobile) <- c("user_id","action","count") | |
names(churned_mobile) <- c("user_id","action","count") | |
healthy_mobile$user_type <- as.factor('healthy') | |
churned_mobile$user_type <- as.factor('churned') | |
total_mobile <- rbind(churned_mobile,healthy_mobile) | |
## MOBILE UPDATES CDF | |
mobile_cdf <- ggplot(total_mobile,aes(x=count,color=user_type)) + | |
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) + | |
scale_y_continuous(breaks=seq(0,1.00,0.1)) + | |
labs(x="Number of Updates",y="Percent of Users") | |
############################################################### | |
## ACTION COUNT ANALYSIS | |
############################################################### | |
## IMPORT AND TIDY DATA | |
churn_actions <- read.csv("~/Downloads/user_actions_churned.csv",header=T) | |
healthy_actions <- read.csv("~/Downloads/user_actions_healthy.csv",header=T) | |
names(churn_actions) <- c("user_id","days_with_update","15_updates_first_week", | |
"changed_schedule","used_extension","used_optimal_timing", | |
"facebook_profiles","linkedin_profiles","pinterest_profiles", | |
"twitter_profiles","team_members","total_updates","action","count") | |
names(healthy_actions) <- c("user_id","days_with_update","15_updates_first_week", | |
"changed_schedule","used_extension","used_optimal_timing", | |
"facebook_profiles","linkedin_profiles","pinterest_profiles", | |
"twitter_profiles","team_members","total_updates","action","count") | |
churn_actions$user_type <- as.factor("churned") | |
healthy_actions$user_type <- as.factor("healthy") | |
all_actions <- rbind(churn_actions,healthy_actions) | |
## TARGET CERTAIN ACTIONS | |
target_actions <- c("dashboard loaded","dashboard profiles selected", | |
"dashboard schedule changed_schedule changed_time", | |
"dashboard schedule changed_schedule changed_day", | |
"dashboard schedule changed_schedule added time", | |
"dashboard analytics viewed basic", | |
"dashboard analytics viewed advanced", | |
"dashboard updates shared composer later", | |
"dashboard updates shared composer now", | |
"dashboard composer media added_photo", | |
"dashboard suggestions viewed", | |
"dashboard updates shared suggestions", | |
"dashboard updates shared composer schedule", | |
"dashboard viewed sent_posts", | |
"dashboard queue changed_update edited composer", | |
"dashboard queue changed_update edited inline", | |
"dashboard queue changed_update deleted", | |
"dashboard updates shared contributions", | |
"dashboard feeds viewed general", | |
"dashboard queue changed_update custom_scheduled", | |
"dashboard updates shared feeds", | |
"extension viewed","extension updates shared composer later", | |
"extension composer media added_photo", | |
"extension composer media added_link", | |
"share sent", "share cancelled") | |
filtered_actions <- all_actions %>% | |
filter(action %in% target_actions) | |
## CONVERT DATA FRAME FROM LONG TO WIDE | |
actions_wide <- spread(filtered_actions,action,count) | |
actions_wide[is.na(actions_wide)] <- 0 | |
actions_wide <- select(actions_wide,-total_updates) | |
names(actions_wide) <- make.names(names(actions_wide)) | |
############################################################### | |
## MACHINE LEARNING ALGORITHMS | |
############################################################### | |
library(caret); library(rattle) | |
inTrain <- createDataPartition(y=actions_wide$user_type,p=0.7,list=F) | |
training <- actions_wide[inTrain,] | |
testing <- actions_wide[-inTrain,] | |
names(training) <- make.names(names(training)) | |
names(testing) <- make.names(names(testing)) | |
############################################################### | |
## DECISION TREE | |
############################################################### | |
library(rpart) | |
treeFit <- rpart(user_type ~., data=select(training,-(user_id:days_with_update)), method="class") | |
## PLOT DECISION TREE | |
fancyRpartPlot(treeFit,cex=0.7) | |
## TEST DECISION TREE | |
actions_wide$tree_predictions <- predict(treeFit,actions_wide) | |
############################################################### | |
## RANDOM FORESTS | |
############################################################### | |
## FIT RANDOM FORESTS MODEL | |
rf_fit <- train(user_type ~., data=select(training,-(user_id:days_with_update)), method="rf", prox=T) | |
save(rf_fit,file="~/Google Drive/R_scripts/rf_model.Rda") | |
## FIT A SECOND MODEL, FOR KICKS | |
rf_fit2 <- randomForest(user_type~., data=select(training,-(user_id:days_with_update))) | |
## CHECK PREDICTIONS OF FIRST MODEL | |
rf_predictions <- predict(rf_fit,testing) | |
testing$rf_prediction_right <- rf_predictions == testing$user_type | |
table(rf_predictions,testing$rf_prediction_right) | |
## CHECK PREDICTIONS OF SECOND MODEL | |
rf2_predictions <- predict(rf_fit2,testing) | |
testing$predRight2 <- rf2_predictions == testing$user_type | |
table(rf2_predictions,testing$predRight2) | |
## SET PREDICTION COLUMN IN ORIGINAL DATA | |
actions_wide$rf_prediction <- predict(rf_fit,actions_wide) | |
## PRESENT USERS, PREDICTIONS | |
predicted_users <- actions_wide %>% | |
select(-(days_with_update:team_members)) %>% | |
select(-(dashboard.analytics.viewed.advanced:tree_predictions)) %>% | |
filter(rf_prediction == "churned" & user_type == "healthy") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment