Created
September 7, 2017 13:05
-
-
Save pengelbrecht/a425cba53f45eb16c705c4eef248b849 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(lubridate) # needs >= v1.6.0.9 for correct handling of lubridate.week.start with floor_date | |
| options(lubridate.week.start = 1) # Weeks start on Mondays, use 7 for Sundays | |
| library(dplyr) | |
| library(ggplot2) | |
| library(scales) | |
| library(glue) | |
| source("../analytics/utils/db.R", local = FALSE) | |
| query <- function(sql, db = mainframe_db) {return(collect(tbl(db, sql(sql))))} | |
| period_diff <- function(start, end, unit = "month") { | |
| start <- floor_date(start, unit = unit) | |
| end <- floor_date(end, unit = unit) | |
| interval <- start %--% end | |
| if(unit == "week") return(interval %/% weeks(1)) | |
| else if(unit == "month") return(interval %/% months(1)) | |
| else if(unit == "quarter") return((interval %/% months(1))/3) | |
| else if(unit == "year") return(interval %/% years(1)) | |
| else return(NA) | |
| } | |
| periodify <- function(revenue_data, unit = "month") { | |
| current_period <- floor_date(today(), unit) | |
| periodified_data <- revenue_data %>% | |
| mutate(period = floor_date(date, unit)) %>% | |
| group_by(customer_id, period) %>% | |
| summarize(amount = sum(amount)) %>% | |
| filter(period < current_period) # remove current (incomplete) period | |
| attr(periodified_data, "period") = unit | |
| return(periodified_data) | |
| } | |
| monthly_aggregate_churn_scalar <- function(start_count, current_count, age, unit, start_at = 1) { | |
| periods <- age - start_at | |
| per_period_retention <- (current_count/start_count)^(1/periods) | |
| monthly_retention <- case_when( | |
| unit == "month" ~ per_period_retention, | |
| unit == "week" ~ per_period_retention^(52/12), | |
| unit == "quarter" ~ per_period_retention^(1/3), | |
| unit == "year" ~ per_period_retention^(1/12) | |
| ) | |
| return(1-monthly_retention) | |
| } | |
| monthly_aggregate_churn <- Vectorize(monthly_aggregate_churn_scalar) | |
| # revenue_data: date, customer_id, amount | |
| cohortify <- function(revenue_data, unit = "month", cohort_retention_base_period = 2) { | |
| if(!is.Date(revenue_data$date)) revenue_data$date <- ymd(revenue_data$date) | |
| periodified_data <- periodify(revenue_data, unit = unit) | |
| cohorts <- periodified_data %>% group_by(customer_id) %>% summarize(cohort = min(period)) | |
| periodified_data <- periodified_data %>% inner_join(cohorts, by = "customer_id") | |
| cohortified_data <- periodified_data %>% | |
| group_by(cohort, period) %>% | |
| summarize(amount = sum(amount), count = n_distinct(customer_id)) %>% | |
| mutate( | |
| age = period_diff(cohort, period, unit) + 1, | |
| count_churn = 1 - count / lag(count), | |
| amount_churn = 1 - amount / lag(amount), | |
| cohort_count = first(count), | |
| cohort_amount = nth(amount, 2), | |
| avg_amount_per_cohort_company = amount / cohort_count, | |
| cum_avg_amount = cumsum(avg_amount_per_cohort_company), | |
| count_retention = count / cohort_count, | |
| amount_retention = amount / nth(amount, cohort_retention_base_period), | |
| agg_count_churn = monthly_aggregate_churn(cohort_count, count, age, unit), | |
| agg_amount_churn = monthly_aggregate_churn(cohort_amount, amount, age, unit), | |
| ) %>% ungroup() | |
| attr(cohortified_data, "period") = unit | |
| return(cohortified_data) | |
| } | |
| remove_outliers <- function(cohort_data, percentile = 0.05) { | |
| cohort_sizes <- cohort_data %>% group_by(cohort) %>% summarize(cohort_size = first(count)) %>% pull(cohort_size) | |
| limit <- quantile(cohort_sizes, percentile) | |
| filtered <- cohort_data %>% filter(cohort_count > limit) | |
| } | |
| human_numbers <- function(x = NULL, smbl ="", signif = 1){ | |
| humanity <- function(y){ | |
| if (!is.na(y)){ | |
| tn <- round(abs(y) / 1e12, signif) | |
| b <- round(abs(y) / 1e9, signif) | |
| m <- round(abs(y) / 1e6, signif) | |
| k <- round(abs(y) / 1e3, signif) | |
| if ( y >= 0 ){ | |
| y_is_positive <- "" | |
| } else { | |
| y_is_positive <- "-" | |
| } | |
| if ( k < 1 ) { | |
| paste0( y_is_positive, smbl, round(abs(y), signif )) | |
| } else if ( m < 1){ | |
| paste0 (y_is_positive, smbl, k , "K") | |
| } else if (b < 1){ | |
| paste0 (y_is_positive, smbl, m ,"M") | |
| }else if(tn < 1){ | |
| paste0 (y_is_positive, smbl, b ,"B") | |
| } else { | |
| paste0 (y_is_positive, smbl, comma(tn), "T") | |
| } | |
| } else if (is.na(y) | is.null(y)){ | |
| "-" | |
| } | |
| } | |
| sapply(x,humanity) | |
| } | |
| #' Human versions of large currency numbers - extensible via smbl | |
| human_gbp <- function(x){human_numbers(x, smbl = "£")} | |
| human_usd <- function(x){human_numbers(x, smbl = "$")} | |
| human_euro <- function(x){human_numbers(x, smbl = "€")} | |
| human_num <- function(x){human_numbers(x, smbl = "")} | |
| per_cohort_monthly_churn <- function(cohort_data) { | |
| unit <- attr(cohort_data, "period") | |
| cohort_data %>% | |
| group_by(cohort) %>% | |
| filter(row_number() == n()) %>% | |
| mutate(monthly_churn = monthly_aggregate_churn(cohort_count, count, age, unit)) %>% | |
| select(cohort, monthly_churn) %>% | |
| ungroup() %>% filter(row_number() != n()) | |
| } | |
| # CHARTS | |
| theme_heatmap <- function() { | |
| return( | |
| theme_bw() + | |
| theme( | |
| legend.position = "none", | |
| panel.grid.major = element_blank(), | |
| panel.grid.minor = element_blank(), | |
| panel.border = element_blank(), | |
| axis.ticks = element_blank() | |
| ) | |
| ) | |
| } | |
| cohort_retention_line_plot <- function(cohort_data) { | |
| ggplot(cohort_data, aes(x = age, y=amount, group = cohort, color = factor(cohort))) + | |
| geom_line(size=1) + | |
| geom_point() + | |
| xlab(paste0("Age (", attr(cohort_data, "period"), "s)")) + ylab(paste0("Revenue")) + labs(color = "Cohort") + theme_bw() + | |
| scale_y_continuous(label = human_numbers) | |
| } | |
| avg_churn_plot <- function(cohort_data) { | |
| churn <- cohort_data %>% group_by(age) %>% summarize(avg_churn = mean(count_churn)) %>% filter(!is.na(avg_churn)) | |
| ggplot(churn, aes(x = age, y = avg_churn)) + geom_point() + geom_smooth() + scale_y_continuous(labels=scales::percent) + | |
| xlab(paste0("Age (", attr(cohort_data, "period"), "s)")) + ylab(paste0("Customer Churn (", attr(cohort_data, "period"), "s)")) + theme_bw() + expand_limits(y = 0) | |
| } | |
| agg_count_churn_cohort_plot <- function(cohort_data) { | |
| unit <- attr(cohort_data, "period") | |
| churn_data <- cohort_data %>% remove_outliers() %>% per_cohort_monthly_churn() | |
| ggplot(churn_data, aes(x = cohort, y = monthly_churn)) + | |
| geom_line(color = "#31775a", size = 1) + | |
| scale_y_continuous(labels=scales::percent) + | |
| xlab("Cohort") + ylab("Average Monthly Churn") + theme_bw() + | |
| expand_limits(y = 0) | |
| } | |
| agg_count_churn_age_plot <- function(cohort_data) { | |
| unit <- attr(cohort_data, "period") | |
| churn_data <- cohort_data %>% remove_outliers() %>% group_by(age) %>% summarize(avg_churn = mean(agg_count_churn)) %>% filter(row_number() > 1) | |
| ggplot(churn_data, aes(x = age, y = avg_churn)) + | |
| geom_line(color = "#31775a", size = 1) + | |
| scale_y_continuous(labels=scales::percent) + | |
| xlab(paste0("Age (", unit, "s)")) + ylab("Average Monthly Churn") + theme_bw() + | |
| expand_limits(y = 0) | |
| } | |
| agg_amount_churn_age_plot <- function(cohort_data) { | |
| unit <- attr(cohort_data, "period") | |
| churn_data <- cohort_data %>% remove_outliers() %>% group_by(age) %>% summarize(avg_churn = mean(agg_amount_churn)) %>% filter(row_number() > 1) | |
| ggplot(churn_data, aes(x = age, y = avg_churn)) + | |
| geom_line(color = "#31775a", size = 1) + | |
| scale_y_continuous(labels=scales::percent) + | |
| xlab(paste0("Age (", unit, "s)")) + ylab("Average Monthly Churn") + theme_bw() + | |
| expand_limits(y = 0) | |
| } | |
| revenue_heatmap <- function(cohort_data) { | |
| unit <- attr(cohort_data, "period") | |
| ggplot(data = cohort_data, aes(x = age, y = cohort)) + geom_tile(aes(fill = amount)) + | |
| geom_text(aes(label = human_numbers(amount, signif = 0)), size = 4) + | |
| scale_fill_gradient(low = "white", high = "#31775a") + theme_heatmap() + | |
| scale_x_continuous(breaks = seq(min(cohort_data$age), max(cohort_data$age), 1), expand = c(0, 0)) + | |
| scale_y_date(breaks = sort(unique(cohort_data$cohort)), expand = c(0, 0)) + | |
| xlab(paste0("Age (", unit, "s)")) + ylab("Cohort") | |
| } | |
| relative_retention_heatmap <- function(cohort_data) { | |
| unit <- attr(cohort_data, "period") | |
| cohort_data <- cohort_data %>% filter(!is.na(amount_retention)) | |
| ggplot(data = cohort_data, aes(x = age, y = cohort)) + geom_tile(aes(fill = amount_retention)) + | |
| geom_text(aes(label = sprintf("%1.0f%%", 100*amount_retention)), size = 4) + | |
| scale_fill_gradient(low = "white", high = "#31775a") + theme_heatmap() + | |
| scale_x_continuous(breaks = seq(min(cohort_data$age), max(cohort_data$age), 1), expand = c(0, 0)) + | |
| scale_y_date(breaks = sort(unique(cohort_data$cohort)), expand = c(0, 0)) + | |
| xlab(paste0("Age (", unit, "s)")) + ylab("Cohort") | |
| } | |
| per_active_customer_revenue_table <- function(cohort_data) { | |
| ggplot(data = cohort_data, aes(x = age, y = cohort)) + geom_tile(aes(fill = amount/count)) + | |
| geom_text(aes(label = human_numbers(amount/count)), size = 2) + | |
| scale_fill_gradient(low = "white", high = "darkgreen") + theme_bw() + | |
| #theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank(), plot.margin=unit(c(0,0,0,0), "cm"), legend.position = "none") + | |
| theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank(), plot.margin=unit(c(0,0,0,0), "cm"), legend.position = "none") + | |
| #scale_y_date(date_breaks = "3 months", date_minor_breaks = "1 month") + | |
| xlab("Age") + labs(fill = "Revenue") | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment