jwinternheimer · August 29, 2015 14:21
diff --git a/churn_exploration.R b/churn_exploration.R
 library(data.table); library(dplyr); library(tidyr)
 library(ggplot2); library(scales); library(grid); library(RColorBrewer)

 ###############################################################
 ## MOST COMMON ACTIONS DURING FIRST TWO WEEKS
 ###############################################################

 ## IMPORT AND TIDY DATA
 healthy_user_actions <- read.table("~/Downloads/small_user_actions.csv",header=T,sep=",")
 churned_user_actions <- read.table("~/Downloads/churned_users_actions.csv",header=T,sep=",")

 names(healthy_user_actions) <- c("user_id","action","count")
 names(churned_user_actions) <- c("user_id","action","count")


 ## GROUP BY ACTION, BIND DATASETS
 healthy_by_action <- healthy_user_actions %>%
  group_by(action) %>%
  summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
  mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
  arrange(desc(count_per_user))

 churned_by_action <- churned_user_actions %>%
  group_by(action) %>%
  summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
  mutate(count_per_user = total_count/users, user_type=as.factor("churned"))

 total_by_action <- rbind(healthy_by_action,churned_by_action)
 save(total_by_action, file="~/Google Drive/total_by_action.Rda")

 ## TARGET ACTIONS - PLOT TOTAL ACTION COUNTS PER USER
 target_actions1 <- arrange(total_by_action,desc(total_count))[0:10,]$action
 target_actions2 <- arrange(total_by_action,desc(total_count))[11:20,]$action
 target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
 target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
 target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action


 ## HORIZONTAL BAR PLOT OF TOP ACTIONS (BY 10)
 action_bar_plot1 <- ggplot(filter(total_by_action,action %in% target_actions1),
                          aes(x=action,y=count_per_user,fill=user_type)) +
                          geom_bar(stat="identity",position="dodge") + 
                          coord_flip() + buffer_theme() +
                          labs(x="",y="Average Actions Per User")


 action_bar_plot2 <- ggplot(filter(total_by_action,action %in% target_actions2),
                           aes(x=action,y=count_per_user,fill=user_type)) +
                           geom_bar(stat="identity",position="dodge") + 
                           coord_flip() + buffer_theme() +
                           labs(x="",y="Average Actions Per User")



 action_bar_plot3 <- ggplot(filter(total_by_action,action %in% target_actions3),
                           aes(x=action,y=count_per_user,fill=user_type)) +
  geom_bar(stat="identity",position="dodge") + 
  coord_flip() + buffer_theme() +
  labs(x="",y="Average Actions Per User")



 action_bar_plot4 <- ggplot(filter(total_by_action,action %in% target_actions4),
                           aes(x=action,y=count_per_user,fill=user_type)) +
  geom_bar(stat="identity",position="dodge") + 
  coord_flip() + buffer_theme() +
  labs(x="",y="Average Actions Per User")



 action_bar_plot5 <- ggplot(filter(total_by_action,action %in% target_actions5),
                           aes(x=action,y=count_per_user,fill=user_type)) +
  geom_bar(stat="identity",position="dodge") + 
  coord_flip() + buffer_theme() +
  labs(x="",y="Average Actions Per User")



 ###############################################################
 ## UPDATES BEHAVIOR 
 ###############################################################

 ## IMPORT AND TIDY DATA
 churned_update_actions <- read.table("~/Downloads/churned_users_updates.csv",sep=",",header=T)
 healthy_update_actions <- read.table("~/Downloads/healthy_users_updates.csv",sep=",",header=T)

 names(churned_update_actions) <- c("user_id","action","count")
 names(healthy_update_actions) <- c("user_id","action","count")

 churned_update_actions$user_type <- as.factor("churned")
 healthy_update_actions$user_type <- as.factor("healthy")

 total_update_actions <- rbind(churned_update_actions,healthy_update_actions)


 ## GROUP BY ACTION, BIND DATASETS
 healthy_updates_by_action <- healthy_update_actions %>%
  group_by(action) %>%
  summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
  mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
  arrange(desc(count_per_user))


 churned_updates_by_action <- churned_update_actions %>%
  group_by(action) %>%
  summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
  mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
  arrange(desc(count_per_user))

 total_updates_by_action <- rbind(healthy_updates_by_action,churned_updates_by_action)
 save(total_updates_by_action, file="~/Google Drive/total_updates_by_action.Rda")


 ## TARGET ACTIONS - FIND MOST COMMON ACTIONS
 target_update_actions1 <- arrange(total_updates_by_action,desc(total_count))[0:1,]$action
 target_update_actions2 <- arrange(total_updates_by_action,desc(total_count))[2,]$action
 target_update_actions3 <- arrange(total_updates_by_action,desc(total_count))[3,]$action
 target_update_actions4 <- arrange(total_updates_by_action,desc(total_count))[4,]$action
 target_update_actions5 <- arrange(total_updates_by_action,desc(total_count))[5,]$action
 target_actions2 <- arrange(total_updates_by_action,desc(total_count))[11:20,]$action
 target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
 target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
 target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action


 ## FIND ACTIONS WITH BIGGEST DIFFERENCES IN COUNTS

 ## CDF OF ACTIONS COUNT
 updates_cdf <- ggplot(filter(total_update_actions,action %in% target_update_actions1),
                           aes(x=count,color=user_type)) +
                           stat_ecdf(size=1) + coord_cartesian(xlim=c(0,250)) +
                           labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Dashboard") +
                           scale_y_continuous(breaks=seq(0,1.00,0.2))


 updates_cdf2 <- ggplot(filter(total_update_actions,action %in% target_update_actions2),
                      aes(x=count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
  labs(x="Number of Updates",y="Percent of Users",title="Suggestions Shared in First 2 Weeks") +
  scale_y_continuous(breaks=seq(0,1.00,0.2))


 updates_cdf4 <- ggplot(filter(total_update_actions,action %in% target_update_actions4),
                       aes(x=count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
  labs(x="Number of Updates",y="Percent of Users",title="Feeds Shared in First 2 Weeks") +
  scale_y_continuous(breaks=seq(0,1.00,0.2))


 updates_cdf5 <- ggplot(filter(total_update_actions,action %in% target_update_actions5),
                       aes(x=count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
  labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Composer in First 2 Weeks") +
  scale_y_continuous(breaks=seq(0,1.00,0.2))

 ###############################################################
 ## INTERACTIONS PER UPDATE
 ###############################################################

 ## IMPORT AND TIDY DATA
 churned_interactions <- read.table("~/Downloads/churned_user_interaction.csv",sep=",",header=T)
 healthy_interactions <- read.table("~/Downloads/healthy_user_interaction.csv",sep=",",header=T)

 names(churned_interactions) <- c("user_id","service","interactions","updates")
 names(healthy_interactions) <- c("user_id","service","interactions","updates")

 churned_interactions$user_type <- as.factor("churned")
 healthy_interactions$user_type <- as.factor("healthy")

 total_interactions <- rbind(churned_interactions,healthy_interactions)
 total_interactions <- total_interactions %>%
  mutate(interaction_per_update = interactions/updates)


 ## CDFS OF INTERACTIONS
 twitter_interaction_cdf <- ggplot(filter(total_interactions,service=="twitter"),
                                  aes(x=interaction_per_update,color=user_type)) +
                                  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,50))+
  labs(x="Interactions per Tweet",y="Percent of Users")


 facebook_interaction_cdf <- ggplot(filter(total_interactions,service=="facebook"),
                                  aes(x=interaction_per_update,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100))+
  labs(x="Interactions per Facebook Post",y="Percent of Users")


 linkedin_interaction_cdf <- ggplot(filter(total_interactions,service=="linkedin"),
                                  aes(x=interaction_per_update,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
  labs(x="Interactions per Linkedin Post",y="Percent of Users")


 google_interaction_cdf <- ggplot(filter(total_interactions,service=="google"),
                                  aes(x=interaction_per_update,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
  labs(x="Interactions per Google+ Post",y="Percent of Users")



 ###############################################################
 ## EXTENSION UPDATES
 ###############################################################

 ## IMPORT AND TIDY DATA
 healthy_extension <- read.csv("~/Downloads/extension_updates_healthy.csv",header=T)
 churned_extension <- read.csv("~/Downloads/extension_updates_churned.csv",header=T)

 names(healthy_extension) <- c("user_id","action","count")
 names(churned_extension) <- c("user_id","action","count")

 healthy_extension$user_type <- as.factor("healthy")
 churned_extension$user_type <- as.factor("churned")

 total_extension_use <- rbind(healthy_extension,churned_extension)

 extension_by_user <- total_extension_use %>%
  group_by(user_id, user_type) %>%
  summarise(updates_shared = sum(count))


 ## CDF OF EXTENSION USAGE

 extension_cdf <- ggplot(extension_by_user,aes(x=updates_shared,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,200)) + 
  scale_y_continuous(breaks=seq(0,1.00,0.2)) +
  labs(x="Updates Shared From Extension",y="Percent of Users")


 ###############################################################
 ## NUMBER OF PROFILES CONNECTED
 ###############################################################


 ## IMPORT AND TIDY DATA
 churned_profiles <- read.csv("~/Downloads/profiles_churned.csv",header=T)
 healthy_profiles <- read.csv("~/Downloads/profiles_healthy.csv",header=T)

 names(churned_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")
 names(healthy_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")

 churned_profiles$user_type <- as.factor("churned")
 healthy_profiles$user_type <- as.factor("healthy")

 total_profiles <- rbind(churned_profiles,healthy_profiles)
 total_profiles <- total_profiles %>%
  mutate(total_profiles = facebook_profiles + linkedin_profiles + pinterest_profiles + twitter_profiles)


 ## CDF OF PROFILES
 profiles_cdf <- ggplot(total_profiles,aes(x=total_profiles,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) + 
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Profiles",y="Percent of Users")


 twitter_cdf <- ggplot(total_profiles,aes(x=twitter_profiles,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) + 
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Twitter Profiles",y="Percent of Users")


 fb_cdf <- ggplot(total_profiles,aes(x=facebook_profiles,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) + 
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Facebook Profiles",y="Percent of Users")


 linkedin_cdf <- ggplot(total_profiles,aes(x=linkedin_profiles,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) + 
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Linkedin Profiles",y="Percent of Users")


 ###############################################################
 ## TEAMMATES
 ###############################################################

 ## IMPORT AND TIDY DATA
 healthy_teammates <- read.csv("~/Downloads/healthy_teammates.csv",header=T)
 churned_teammates <- read.csv("~/Downloads/churned_teammates.csv",header=T)

 names(healthy_teammates) <- c("user_id","teammates")
 names(churned_teammates) <- c("user_id","teammates")

 healthy_teammates$user_type <- as.factor('healthy')
 churned_teammates$user_type <- as.factor('churned')

 total_teammates <- rbind(churned_teammates,healthy_teammates)


 ## MOBILE UPDATES CDF
 teammates_cdf <- ggplot(total_teammates,aes(x=teammates,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,20)) + 
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Teammates",y="Percent of Users")


 ###############################################################
 ## ANALYTICS ACTIONS
 ###############################################################

 ## IMPORT AND TIDY DATA
 healthy_analytics <- read.csv("~/Downloads/healthy_analytics.csv",header=T)
 churned_analytics <- read.csv("~/Downloads/churned_analytics.csv",header=T)

 names(healthy_analytics) <- c("user_id","action_count")
 names(churned_analytics) <- c("user_id","action_count")

 healthy_analytics$user_type <- as.factor('healthy')
 churned_analytics$user_type <- as.factor('churned')

 total_analytics <- rbind(churned_analytics,healthy_analytics)


 ## ANALYTICS ACTIONS CDF
 analytics_cdf <- ggplot(total_analytics,aes(x=action_count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) + 
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Analytics Actions",y="Percent of Users")


 ###############################################################
 ## MOBILE UPDATES
 ###############################################################

 ## IMPORT AND TIDY DATA
 healthy_mobile <- read.csv("~/Downloads/healthy_mobile.csv",header=T)
 churned_mobile <- read.csv("~/Downloads/churned_mobile.csv",header=T)

 names(healthy_mobile) <- c("user_id","action","count")
 names(churned_mobile) <- c("user_id","action","count")

 healthy_mobile$user_type <- as.factor('healthy')
 churned_mobile$user_type <- as.factor('churned')

 total_mobile <- rbind(churned_mobile,healthy_mobile)


 ## MOBILE UPDATES CDF
 mobile_cdf <- ggplot(total_mobile,aes(x=count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) + 
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Updates",y="Percent of Users")




 ###############################################################
 ## ACTION COUNT ANALYSIS
 ###############################################################

 ## IMPORT AND TIDY DATA
 churn_actions <- read.csv("~/Downloads/user_actions_churned.csv",header=T)
 healthy_actions <- read.csv("~/Downloads/user_actions_healthy.csv",header=T)

 names(churn_actions) <- c("user_id","days_with_update","15_updates_first_week",
                            "changed_schedule","used_extension","used_optimal_timing",
                            "facebook_profiles","linkedin_profiles","pinterest_profiles",
                            "twitter_profiles","team_members","total_updates","action","count")

 names(healthy_actions) <- c("user_id","days_with_update","15_updates_first_week",
                          "changed_schedule","used_extension","used_optimal_timing",
                          "facebook_profiles","linkedin_profiles","pinterest_profiles",
                          "twitter_profiles","team_members","total_updates","action","count")

 churn_actions$user_type <- as.factor("churned")
 healthy_actions$user_type <- as.factor("healthy")

 all_actions <- rbind(churn_actions,healthy_actions)


 ## TARGET CERTAIN ACTIONS
 target_actions <- c("dashboard loaded","dashboard profiles selected",
                    "dashboard schedule changed_schedule changed_time",
                    "dashboard schedule changed_schedule changed_day",
                    "dashboard schedule changed_schedule added time",
                    "dashboard analytics viewed basic",
                    "dashboard analytics viewed advanced",
                    "dashboard updates shared composer later",
                    "dashboard updates shared composer now",
                    "dashboard composer media added_photo",
                    "dashboard suggestions viewed",
                    "dashboard updates shared suggestions",
                    "dashboard updates shared composer schedule",
                    "dashboard viewed sent_posts",
                    "dashboard queue changed_update edited composer",
                    "dashboard queue changed_update edited inline",
                    "dashboard queue changed_update deleted",
                    "dashboard updates shared contributions",
                    "dashboard feeds viewed general",
                    "dashboard queue changed_update custom_scheduled",
                    "dashboard updates shared feeds",
                    "extension viewed","extension updates shared composer later",
                    "extension composer media added_photo",
                    "extension composer media added_link",
                    "share sent", "share cancelled")

 filtered_actions <- all_actions %>%
  filter(action %in% target_actions)

 ## CONVERT DATA FRAME FROM LONG TO WIDE
 actions_wide <- spread(filtered_actions,action,count)
 actions_wide[is.na(actions_wide)] <- 0
 actions_wide <- select(actions_wide,-total_updates)
 names(actions_wide) <- make.names(names(actions_wide))

 ###############################################################
 ## MACHINE LEARNING ALGORITHMS
 ###############################################################
 library(caret); library(rattle)

 inTrain <- createDataPartition(y=actions_wide$user_type,p=0.7,list=F)
 training <- actions_wide[inTrain,]
 testing <- actions_wide[-inTrain,] 

 names(training) <- make.names(names(training))
 names(testing) <- make.names(names(testing))


 ###############################################################
 ## DECISION TREE
 ###############################################################
 library(rpart)
 treeFit <- rpart(user_type ~., data=select(training,-(user_id:days_with_update)), method="class")

 ## PLOT DECISION TREE
 fancyRpartPlot(treeFit,cex=0.7)

 ## TEST DECISION TREE
 actions_wide$tree_predictions <- predict(treeFit,actions_wide)


 ###############################################################
 ## RANDOM FORESTS
 ###############################################################

 ## FIT RANDOM FORESTS MODEL
 rf_fit <- train(user_type ~., data=select(training,-(user_id:days_with_update)), method="rf", prox=T)
 save(rf_fit,file="~/Google Drive/R_scripts/rf_model.Rda")

 ## FIT A SECOND MODEL, FOR KICKS
 rf_fit2 <- randomForest(user_type~., data=select(training,-(user_id:days_with_update)))

 ## CHECK PREDICTIONS OF FIRST MODEL
 rf_predictions <- predict(rf_fit,testing)
 testing$rf_prediction_right <- rf_predictions == testing$user_type
 table(rf_predictions,testing$rf_prediction_right)

 ## CHECK PREDICTIONS OF SECOND MODEL
 rf2_predictions <- predict(rf_fit2,testing)
 testing$predRight2 <- rf2_predictions == testing$user_type
 table(rf2_predictions,testing$predRight2)

 ## SET PREDICTION COLUMN IN ORIGINAL DATA
 actions_wide$rf_prediction <- predict(rf_fit,actions_wide)

 ## PRESENT USERS, PREDICTIONS
 predicted_users <- actions_wide %>%
  select(-(days_with_update:team_members)) %>%
  select(-(dashboard.analytics.viewed.advanced:tree_predictions)) %>%
  filter(rf_prediction == "churned"  & user_type == "healthy")
	library(data.table); library(dplyr); library(tidyr)
	library(ggplot2); library(scales); library(grid); library(RColorBrewer)

	###############################################################
	## MOST COMMON ACTIONS DURING FIRST TWO WEEKS
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_user_actions <- read.table("~/Downloads/small_user_actions.csv",header=T,sep=",")
	churned_user_actions <- read.table("~/Downloads/churned_users_actions.csv",header=T,sep=",")

	names(healthy_user_actions) <- c("user_id","action","count")
	names(churned_user_actions) <- c("user_id","action","count")


	## GROUP BY ACTION, BIND DATASETS
	healthy_by_action <- healthy_user_actions %>%
	group_by(action) %>%
	summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
	mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
	arrange(desc(count_per_user))

	churned_by_action <- churned_user_actions %>%
	group_by(action) %>%
	summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
	mutate(count_per_user = total_count/users, user_type=as.factor("churned"))

	total_by_action <- rbind(healthy_by_action,churned_by_action)
	save(total_by_action, file="~/Google Drive/total_by_action.Rda")

	## TARGET ACTIONS - PLOT TOTAL ACTION COUNTS PER USER
	target_actions1 <- arrange(total_by_action,desc(total_count))[0:10,]$action
	target_actions2 <- arrange(total_by_action,desc(total_count))[11:20,]$action
	target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
	target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
	target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action


	## HORIZONTAL BAR PLOT OF TOP ACTIONS (BY 10)
	action_bar_plot1 <- ggplot(filter(total_by_action,action %in% target_actions1),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")


	action_bar_plot2 <- ggplot(filter(total_by_action,action %in% target_actions2),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")



	action_bar_plot3 <- ggplot(filter(total_by_action,action %in% target_actions3),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")



	action_bar_plot4 <- ggplot(filter(total_by_action,action %in% target_actions4),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")



	action_bar_plot5 <- ggplot(filter(total_by_action,action %in% target_actions5),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")



	###############################################################
	## UPDATES BEHAVIOR
	###############################################################

	## IMPORT AND TIDY DATA
	churned_update_actions <- read.table("~/Downloads/churned_users_updates.csv",sep=",",header=T)
	healthy_update_actions <- read.table("~/Downloads/healthy_users_updates.csv",sep=",",header=T)

	names(churned_update_actions) <- c("user_id","action","count")
	names(healthy_update_actions) <- c("user_id","action","count")

	churned_update_actions$user_type <- as.factor("churned")
	healthy_update_actions$user_type <- as.factor("healthy")

	total_update_actions <- rbind(churned_update_actions,healthy_update_actions)


	## GROUP BY ACTION, BIND DATASETS
	healthy_updates_by_action <- healthy_update_actions %>%
	group_by(action) %>%
	summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
	mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
	arrange(desc(count_per_user))


	churned_updates_by_action <- churned_update_actions %>%
	group_by(action) %>%
	summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
	mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
	arrange(desc(count_per_user))

	total_updates_by_action <- rbind(healthy_updates_by_action,churned_updates_by_action)
	save(total_updates_by_action, file="~/Google Drive/total_updates_by_action.Rda")


	## TARGET ACTIONS - FIND MOST COMMON ACTIONS
	target_update_actions1 <- arrange(total_updates_by_action,desc(total_count))[0:1,]$action
	target_update_actions2 <- arrange(total_updates_by_action,desc(total_count))[2,]$action
	target_update_actions3 <- arrange(total_updates_by_action,desc(total_count))[3,]$action
	target_update_actions4 <- arrange(total_updates_by_action,desc(total_count))[4,]$action
	target_update_actions5 <- arrange(total_updates_by_action,desc(total_count))[5,]$action
	target_actions2 <- arrange(total_updates_by_action,desc(total_count))[11:20,]$action
	target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
	target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
	target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action


	## FIND ACTIONS WITH BIGGEST DIFFERENCES IN COUNTS

	## CDF OF ACTIONS COUNT
	updates_cdf <- ggplot(filter(total_update_actions,action %in% target_update_actions1),
	aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,250)) +
	labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Dashboard") +
	scale_y_continuous(breaks=seq(0,1.00,0.2))


	updates_cdf2 <- ggplot(filter(total_update_actions,action %in% target_update_actions2),
	aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
	labs(x="Number of Updates",y="Percent of Users",title="Suggestions Shared in First 2 Weeks") +
	scale_y_continuous(breaks=seq(0,1.00,0.2))


	updates_cdf4 <- ggplot(filter(total_update_actions,action %in% target_update_actions4),
	aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
	labs(x="Number of Updates",y="Percent of Users",title="Feeds Shared in First 2 Weeks") +
	scale_y_continuous(breaks=seq(0,1.00,0.2))


	updates_cdf5 <- ggplot(filter(total_update_actions,action %in% target_update_actions5),
	aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
	labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Composer in First 2 Weeks") +
	scale_y_continuous(breaks=seq(0,1.00,0.2))

	###############################################################
	## INTERACTIONS PER UPDATE
	###############################################################

	## IMPORT AND TIDY DATA
	churned_interactions <- read.table("~/Downloads/churned_user_interaction.csv",sep=",",header=T)
	healthy_interactions <- read.table("~/Downloads/healthy_user_interaction.csv",sep=",",header=T)

	names(churned_interactions) <- c("user_id","service","interactions","updates")
	names(healthy_interactions) <- c("user_id","service","interactions","updates")

	churned_interactions$user_type <- as.factor("churned")
	healthy_interactions$user_type <- as.factor("healthy")

	total_interactions <- rbind(churned_interactions,healthy_interactions)
	total_interactions <- total_interactions %>%
	mutate(interaction_per_update = interactions/updates)


	## CDFS OF INTERACTIONS
	twitter_interaction_cdf <- ggplot(filter(total_interactions,service=="twitter"),
	aes(x=interaction_per_update,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,50))+
	labs(x="Interactions per Tweet",y="Percent of Users")


	facebook_interaction_cdf <- ggplot(filter(total_interactions,service=="facebook"),
	aes(x=interaction_per_update,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100))+
	labs(x="Interactions per Facebook Post",y="Percent of Users")


	linkedin_interaction_cdf <- ggplot(filter(total_interactions,service=="linkedin"),
	aes(x=interaction_per_update,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
	labs(x="Interactions per Linkedin Post",y="Percent of Users")


	google_interaction_cdf <- ggplot(filter(total_interactions,service=="google"),
	aes(x=interaction_per_update,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
	labs(x="Interactions per Google+ Post",y="Percent of Users")



	###############################################################
	## EXTENSION UPDATES
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_extension <- read.csv("~/Downloads/extension_updates_healthy.csv",header=T)
	churned_extension <- read.csv("~/Downloads/extension_updates_churned.csv",header=T)

	names(healthy_extension) <- c("user_id","action","count")
	names(churned_extension) <- c("user_id","action","count")

	healthy_extension$user_type <- as.factor("healthy")
	churned_extension$user_type <- as.factor("churned")

	total_extension_use <- rbind(healthy_extension,churned_extension)

	extension_by_user <- total_extension_use %>%
	group_by(user_id, user_type) %>%
	summarise(updates_shared = sum(count))


	## CDF OF EXTENSION USAGE

	extension_cdf <- ggplot(extension_by_user,aes(x=updates_shared,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,200)) +
	scale_y_continuous(breaks=seq(0,1.00,0.2)) +
	labs(x="Updates Shared From Extension",y="Percent of Users")


	###############################################################
	## NUMBER OF PROFILES CONNECTED
	###############################################################


	## IMPORT AND TIDY DATA
	churned_profiles <- read.csv("~/Downloads/profiles_churned.csv",header=T)
	healthy_profiles <- read.csv("~/Downloads/profiles_healthy.csv",header=T)

	names(churned_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")
	names(healthy_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")

	churned_profiles$user_type <- as.factor("churned")
	healthy_profiles$user_type <- as.factor("healthy")

	total_profiles <- rbind(churned_profiles,healthy_profiles)
	total_profiles <- total_profiles %>%
	mutate(total_profiles = facebook_profiles + linkedin_profiles + pinterest_profiles + twitter_profiles)


	## CDF OF PROFILES
	profiles_cdf <- ggplot(total_profiles,aes(x=total_profiles,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Profiles",y="Percent of Users")


	twitter_cdf <- ggplot(total_profiles,aes(x=twitter_profiles,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Twitter Profiles",y="Percent of Users")


	fb_cdf <- ggplot(total_profiles,aes(x=facebook_profiles,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Facebook Profiles",y="Percent of Users")


	linkedin_cdf <- ggplot(total_profiles,aes(x=linkedin_profiles,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Linkedin Profiles",y="Percent of Users")


	###############################################################
	## TEAMMATES
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_teammates <- read.csv("~/Downloads/healthy_teammates.csv",header=T)
	churned_teammates <- read.csv("~/Downloads/churned_teammates.csv",header=T)

	names(healthy_teammates) <- c("user_id","teammates")
	names(churned_teammates) <- c("user_id","teammates")

	healthy_teammates$user_type <- as.factor('healthy')
	churned_teammates$user_type <- as.factor('churned')

	total_teammates <- rbind(churned_teammates,healthy_teammates)


	## MOBILE UPDATES CDF
	teammates_cdf <- ggplot(total_teammates,aes(x=teammates,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,20)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Teammates",y="Percent of Users")


	###############################################################
	## ANALYTICS ACTIONS
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_analytics <- read.csv("~/Downloads/healthy_analytics.csv",header=T)
	churned_analytics <- read.csv("~/Downloads/churned_analytics.csv",header=T)

	names(healthy_analytics) <- c("user_id","action_count")
	names(churned_analytics) <- c("user_id","action_count")

	healthy_analytics$user_type <- as.factor('healthy')
	churned_analytics$user_type <- as.factor('churned')

	total_analytics <- rbind(churned_analytics,healthy_analytics)


	## ANALYTICS ACTIONS CDF
	analytics_cdf <- ggplot(total_analytics,aes(x=action_count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Analytics Actions",y="Percent of Users")


	###############################################################
	## MOBILE UPDATES
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_mobile <- read.csv("~/Downloads/healthy_mobile.csv",header=T)
	churned_mobile <- read.csv("~/Downloads/churned_mobile.csv",header=T)

	names(healthy_mobile) <- c("user_id","action","count")
	names(churned_mobile) <- c("user_id","action","count")

	healthy_mobile$user_type <- as.factor('healthy')
	churned_mobile$user_type <- as.factor('churned')

	total_mobile <- rbind(churned_mobile,healthy_mobile)


	## MOBILE UPDATES CDF
	mobile_cdf <- ggplot(total_mobile,aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Updates",y="Percent of Users")




	###############################################################
	## ACTION COUNT ANALYSIS
	###############################################################

	## IMPORT AND TIDY DATA
	churn_actions <- read.csv("~/Downloads/user_actions_churned.csv",header=T)
	healthy_actions <- read.csv("~/Downloads/user_actions_healthy.csv",header=T)

	names(churn_actions) <- c("user_id","days_with_update","15_updates_first_week",
	"changed_schedule","used_extension","used_optimal_timing",
	"facebook_profiles","linkedin_profiles","pinterest_profiles",
	"twitter_profiles","team_members","total_updates","action","count")

	names(healthy_actions) <- c("user_id","days_with_update","15_updates_first_week",
	"changed_schedule","used_extension","used_optimal_timing",
	"facebook_profiles","linkedin_profiles","pinterest_profiles",
	"twitter_profiles","team_members","total_updates","action","count")

	churn_actions$user_type <- as.factor("churned")
	healthy_actions$user_type <- as.factor("healthy")

	all_actions <- rbind(churn_actions,healthy_actions)


	## TARGET CERTAIN ACTIONS
	target_actions <- c("dashboard loaded","dashboard profiles selected",
	"dashboard schedule changed_schedule changed_time",
	"dashboard schedule changed_schedule changed_day",
	"dashboard schedule changed_schedule added time",
	"dashboard analytics viewed basic",
	"dashboard analytics viewed advanced",
	"dashboard updates shared composer later",
	"dashboard updates shared composer now",
	"dashboard composer media added_photo",
	"dashboard suggestions viewed",
	"dashboard updates shared suggestions",
	"dashboard updates shared composer schedule",
	"dashboard viewed sent_posts",
	"dashboard queue changed_update edited composer",
	"dashboard queue changed_update edited inline",
	"dashboard queue changed_update deleted",
	"dashboard updates shared contributions",
	"dashboard feeds viewed general",
	"dashboard queue changed_update custom_scheduled",
	"dashboard updates shared feeds",
	"extension viewed","extension updates shared composer later",
	"extension composer media added_photo",
	"extension composer media added_link",
	"share sent", "share cancelled")

	filtered_actions <- all_actions %>%
	filter(action %in% target_actions)

	## CONVERT DATA FRAME FROM LONG TO WIDE
	actions_wide <- spread(filtered_actions,action,count)
	actions_wide[is.na(actions_wide)] <- 0
	actions_wide <- select(actions_wide,-total_updates)
	names(actions_wide) <- make.names(names(actions_wide))

	###############################################################
	## MACHINE LEARNING ALGORITHMS
	###############################################################
	library(caret); library(rattle)

	inTrain <- createDataPartition(y=actions_wide$user_type,p=0.7,list=F)
	training <- actions_wide[inTrain,]
	testing <- actions_wide[-inTrain,]

	names(training) <- make.names(names(training))
	names(testing) <- make.names(names(testing))


	###############################################################
	## DECISION TREE
	###############################################################
	library(rpart)
	treeFit <- rpart(user_type ~., data=select(training,-(user_id:days_with_update)), method="class")

	## PLOT DECISION TREE
	fancyRpartPlot(treeFit,cex=0.7)

	## TEST DECISION TREE
	actions_wide$tree_predictions <- predict(treeFit,actions_wide)


	###############################################################
	## RANDOM FORESTS
	###############################################################

	## FIT RANDOM FORESTS MODEL
	rf_fit <- train(user_type ~., data=select(training,-(user_id:days_with_update)), method="rf", prox=T)
	save(rf_fit,file="~/Google Drive/R_scripts/rf_model.Rda")

	## FIT A SECOND MODEL, FOR KICKS
	rf_fit2 <- randomForest(user_type~., data=select(training,-(user_id:days_with_update)))

	## CHECK PREDICTIONS OF FIRST MODEL
	rf_predictions <- predict(rf_fit,testing)
	testing$rf_prediction_right <- rf_predictions == testing$user_type
	table(rf_predictions,testing$rf_prediction_right)

	## CHECK PREDICTIONS OF SECOND MODEL
	rf2_predictions <- predict(rf_fit2,testing)
	testing$predRight2 <- rf2_predictions == testing$user_type
	table(rf2_predictions,testing$predRight2)

	## SET PREDICTION COLUMN IN ORIGINAL DATA
	actions_wide$rf_prediction <- predict(rf_fit,actions_wide)

	## PRESENT USERS, PREDICTIONS
	predicted_users <- actions_wide %>%
	select(-(days_with_update:team_members)) %>%
	select(-(dashboard.analytics.viewed.advanced:tree_predictions)) %>%
	filter(rf_prediction == "churned" & user_type == "healthy")