Created
September 7, 2017 13:05
-
-
Save pengelbrecht/a425cba53f45eb16c705c4eef248b849 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(lubridate) # needs >= v1.6.0.9 for correct handling of lubridate.week.start with floor_date | |
options(lubridate.week.start = 1) # Weeks start on Mondays, use 7 for Sundays | |
library(dplyr) | |
library(ggplot2) | |
library(scales) | |
library(glue) | |
source("../analytics/utils/db.R", local = FALSE) | |
query <- function(sql, db = mainframe_db) {return(collect(tbl(db, sql(sql))))} | |
period_diff <- function(start, end, unit = "month") { | |
start <- floor_date(start, unit = unit) | |
end <- floor_date(end, unit = unit) | |
interval <- start %--% end | |
if(unit == "week") return(interval %/% weeks(1)) | |
else if(unit == "month") return(interval %/% months(1)) | |
else if(unit == "quarter") return((interval %/% months(1))/3) | |
else if(unit == "year") return(interval %/% years(1)) | |
else return(NA) | |
} | |
periodify <- function(revenue_data, unit = "month") { | |
current_period <- floor_date(today(), unit) | |
periodified_data <- revenue_data %>% | |
mutate(period = floor_date(date, unit)) %>% | |
group_by(customer_id, period) %>% | |
summarize(amount = sum(amount)) %>% | |
filter(period < current_period) # remove current (incomplete) period | |
attr(periodified_data, "period") = unit | |
return(periodified_data) | |
} | |
monthly_aggregate_churn_scalar <- function(start_count, current_count, age, unit, start_at = 1) { | |
periods <- age - start_at | |
per_period_retention <- (current_count/start_count)^(1/periods) | |
monthly_retention <- case_when( | |
unit == "month" ~ per_period_retention, | |
unit == "week" ~ per_period_retention^(52/12), | |
unit == "quarter" ~ per_period_retention^(1/3), | |
unit == "year" ~ per_period_retention^(1/12) | |
) | |
return(1-monthly_retention) | |
} | |
monthly_aggregate_churn <- Vectorize(monthly_aggregate_churn_scalar) | |
# revenue_data: date, customer_id, amount | |
cohortify <- function(revenue_data, unit = "month", cohort_retention_base_period = 2) { | |
if(!is.Date(revenue_data$date)) revenue_data$date <- ymd(revenue_data$date) | |
periodified_data <- periodify(revenue_data, unit = unit) | |
cohorts <- periodified_data %>% group_by(customer_id) %>% summarize(cohort = min(period)) | |
periodified_data <- periodified_data %>% inner_join(cohorts, by = "customer_id") | |
cohortified_data <- periodified_data %>% | |
group_by(cohort, period) %>% | |
summarize(amount = sum(amount), count = n_distinct(customer_id)) %>% | |
mutate( | |
age = period_diff(cohort, period, unit) + 1, | |
count_churn = 1 - count / lag(count), | |
amount_churn = 1 - amount / lag(amount), | |
cohort_count = first(count), | |
cohort_amount = nth(amount, 2), | |
avg_amount_per_cohort_company = amount / cohort_count, | |
cum_avg_amount = cumsum(avg_amount_per_cohort_company), | |
count_retention = count / cohort_count, | |
amount_retention = amount / nth(amount, cohort_retention_base_period), | |
agg_count_churn = monthly_aggregate_churn(cohort_count, count, age, unit), | |
agg_amount_churn = monthly_aggregate_churn(cohort_amount, amount, age, unit), | |
) %>% ungroup() | |
attr(cohortified_data, "period") = unit | |
return(cohortified_data) | |
} | |
remove_outliers <- function(cohort_data, percentile = 0.05) { | |
cohort_sizes <- cohort_data %>% group_by(cohort) %>% summarize(cohort_size = first(count)) %>% pull(cohort_size) | |
limit <- quantile(cohort_sizes, percentile) | |
filtered <- cohort_data %>% filter(cohort_count > limit) | |
} | |
human_numbers <- function(x = NULL, smbl ="", signif = 1){ | |
humanity <- function(y){ | |
if (!is.na(y)){ | |
tn <- round(abs(y) / 1e12, signif) | |
b <- round(abs(y) / 1e9, signif) | |
m <- round(abs(y) / 1e6, signif) | |
k <- round(abs(y) / 1e3, signif) | |
if ( y >= 0 ){ | |
y_is_positive <- "" | |
} else { | |
y_is_positive <- "-" | |
} | |
if ( k < 1 ) { | |
paste0( y_is_positive, smbl, round(abs(y), signif )) | |
} else if ( m < 1){ | |
paste0 (y_is_positive, smbl, k , "K") | |
} else if (b < 1){ | |
paste0 (y_is_positive, smbl, m ,"M") | |
}else if(tn < 1){ | |
paste0 (y_is_positive, smbl, b ,"B") | |
} else { | |
paste0 (y_is_positive, smbl, comma(tn), "T") | |
} | |
} else if (is.na(y) | is.null(y)){ | |
"-" | |
} | |
} | |
sapply(x,humanity) | |
} | |
#' Human versions of large currency numbers - extensible via smbl | |
human_gbp <- function(x){human_numbers(x, smbl = "£")} | |
human_usd <- function(x){human_numbers(x, smbl = "$")} | |
human_euro <- function(x){human_numbers(x, smbl = "€")} | |
human_num <- function(x){human_numbers(x, smbl = "")} | |
per_cohort_monthly_churn <- function(cohort_data) { | |
unit <- attr(cohort_data, "period") | |
cohort_data %>% | |
group_by(cohort) %>% | |
filter(row_number() == n()) %>% | |
mutate(monthly_churn = monthly_aggregate_churn(cohort_count, count, age, unit)) %>% | |
select(cohort, monthly_churn) %>% | |
ungroup() %>% filter(row_number() != n()) | |
} | |
# CHARTS | |
theme_heatmap <- function() { | |
return( | |
theme_bw() + | |
theme( | |
legend.position = "none", | |
panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
panel.border = element_blank(), | |
axis.ticks = element_blank() | |
) | |
) | |
} | |
cohort_retention_line_plot <- function(cohort_data) { | |
ggplot(cohort_data, aes(x = age, y=amount, group = cohort, color = factor(cohort))) + | |
geom_line(size=1) + | |
geom_point() + | |
xlab(paste0("Age (", attr(cohort_data, "period"), "s)")) + ylab(paste0("Revenue")) + labs(color = "Cohort") + theme_bw() + | |
scale_y_continuous(label = human_numbers) | |
} | |
avg_churn_plot <- function(cohort_data) { | |
churn <- cohort_data %>% group_by(age) %>% summarize(avg_churn = mean(count_churn)) %>% filter(!is.na(avg_churn)) | |
ggplot(churn, aes(x = age, y = avg_churn)) + geom_point() + geom_smooth() + scale_y_continuous(labels=scales::percent) + | |
xlab(paste0("Age (", attr(cohort_data, "period"), "s)")) + ylab(paste0("Customer Churn (", attr(cohort_data, "period"), "s)")) + theme_bw() + expand_limits(y = 0) | |
} | |
agg_count_churn_cohort_plot <- function(cohort_data) { | |
unit <- attr(cohort_data, "period") | |
churn_data <- cohort_data %>% remove_outliers() %>% per_cohort_monthly_churn() | |
ggplot(churn_data, aes(x = cohort, y = monthly_churn)) + | |
geom_line(color = "#31775a", size = 1) + | |
scale_y_continuous(labels=scales::percent) + | |
xlab("Cohort") + ylab("Average Monthly Churn") + theme_bw() + | |
expand_limits(y = 0) | |
} | |
agg_count_churn_age_plot <- function(cohort_data) { | |
unit <- attr(cohort_data, "period") | |
churn_data <- cohort_data %>% remove_outliers() %>% group_by(age) %>% summarize(avg_churn = mean(agg_count_churn)) %>% filter(row_number() > 1) | |
ggplot(churn_data, aes(x = age, y = avg_churn)) + | |
geom_line(color = "#31775a", size = 1) + | |
scale_y_continuous(labels=scales::percent) + | |
xlab(paste0("Age (", unit, "s)")) + ylab("Average Monthly Churn") + theme_bw() + | |
expand_limits(y = 0) | |
} | |
agg_amount_churn_age_plot <- function(cohort_data) { | |
unit <- attr(cohort_data, "period") | |
churn_data <- cohort_data %>% remove_outliers() %>% group_by(age) %>% summarize(avg_churn = mean(agg_amount_churn)) %>% filter(row_number() > 1) | |
ggplot(churn_data, aes(x = age, y = avg_churn)) + | |
geom_line(color = "#31775a", size = 1) + | |
scale_y_continuous(labels=scales::percent) + | |
xlab(paste0("Age (", unit, "s)")) + ylab("Average Monthly Churn") + theme_bw() + | |
expand_limits(y = 0) | |
} | |
revenue_heatmap <- function(cohort_data) { | |
unit <- attr(cohort_data, "period") | |
ggplot(data = cohort_data, aes(x = age, y = cohort)) + geom_tile(aes(fill = amount)) + | |
geom_text(aes(label = human_numbers(amount, signif = 0)), size = 4) + | |
scale_fill_gradient(low = "white", high = "#31775a") + theme_heatmap() + | |
scale_x_continuous(breaks = seq(min(cohort_data$age), max(cohort_data$age), 1), expand = c(0, 0)) + | |
scale_y_date(breaks = sort(unique(cohort_data$cohort)), expand = c(0, 0)) + | |
xlab(paste0("Age (", unit, "s)")) + ylab("Cohort") | |
} | |
relative_retention_heatmap <- function(cohort_data) { | |
unit <- attr(cohort_data, "period") | |
cohort_data <- cohort_data %>% filter(!is.na(amount_retention)) | |
ggplot(data = cohort_data, aes(x = age, y = cohort)) + geom_tile(aes(fill = amount_retention)) + | |
geom_text(aes(label = sprintf("%1.0f%%", 100*amount_retention)), size = 4) + | |
scale_fill_gradient(low = "white", high = "#31775a") + theme_heatmap() + | |
scale_x_continuous(breaks = seq(min(cohort_data$age), max(cohort_data$age), 1), expand = c(0, 0)) + | |
scale_y_date(breaks = sort(unique(cohort_data$cohort)), expand = c(0, 0)) + | |
xlab(paste0("Age (", unit, "s)")) + ylab("Cohort") | |
} | |
per_active_customer_revenue_table <- function(cohort_data) { | |
ggplot(data = cohort_data, aes(x = age, y = cohort)) + geom_tile(aes(fill = amount/count)) + | |
geom_text(aes(label = human_numbers(amount/count)), size = 2) + | |
scale_fill_gradient(low = "white", high = "darkgreen") + theme_bw() + | |
#theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank(), plot.margin=unit(c(0,0,0,0), "cm"), legend.position = "none") + | |
theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank(), plot.margin=unit(c(0,0,0,0), "cm"), legend.position = "none") + | |
#scale_y_date(date_breaks = "3 months", date_minor_breaks = "1 month") + | |
xlab("Age") + labs(fill = "Revenue") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment