Skip to content

Instantly share code, notes, and snippets.

@pengelbrecht
Created September 7, 2017 13:05
Show Gist options
  • Save pengelbrecht/a425cba53f45eb16c705c4eef248b849 to your computer and use it in GitHub Desktop.
Save pengelbrecht/a425cba53f45eb16c705c4eef248b849 to your computer and use it in GitHub Desktop.
library(lubridate) # needs >= v1.6.0.9 for correct handling of lubridate.week.start with floor_date
options(lubridate.week.start = 1) # Weeks start on Mondays, use 7 for Sundays
library(dplyr)
library(ggplot2)
library(scales)
library(glue)
source("../analytics/utils/db.R", local = FALSE)
query <- function(sql, db = mainframe_db) {return(collect(tbl(db, sql(sql))))}
period_diff <- function(start, end, unit = "month") {
start <- floor_date(start, unit = unit)
end <- floor_date(end, unit = unit)
interval <- start %--% end
if(unit == "week") return(interval %/% weeks(1))
else if(unit == "month") return(interval %/% months(1))
else if(unit == "quarter") return((interval %/% months(1))/3)
else if(unit == "year") return(interval %/% years(1))
else return(NA)
}
periodify <- function(revenue_data, unit = "month") {
current_period <- floor_date(today(), unit)
periodified_data <- revenue_data %>%
mutate(period = floor_date(date, unit)) %>%
group_by(customer_id, period) %>%
summarize(amount = sum(amount)) %>%
filter(period < current_period) # remove current (incomplete) period
attr(periodified_data, "period") = unit
return(periodified_data)
}
monthly_aggregate_churn_scalar <- function(start_count, current_count, age, unit, start_at = 1) {
periods <- age - start_at
per_period_retention <- (current_count/start_count)^(1/periods)
monthly_retention <- case_when(
unit == "month" ~ per_period_retention,
unit == "week" ~ per_period_retention^(52/12),
unit == "quarter" ~ per_period_retention^(1/3),
unit == "year" ~ per_period_retention^(1/12)
)
return(1-monthly_retention)
}
monthly_aggregate_churn <- Vectorize(monthly_aggregate_churn_scalar)
# revenue_data: date, customer_id, amount
cohortify <- function(revenue_data, unit = "month", cohort_retention_base_period = 2) {
if(!is.Date(revenue_data$date)) revenue_data$date <- ymd(revenue_data$date)
periodified_data <- periodify(revenue_data, unit = unit)
cohorts <- periodified_data %>% group_by(customer_id) %>% summarize(cohort = min(period))
periodified_data <- periodified_data %>% inner_join(cohorts, by = "customer_id")
cohortified_data <- periodified_data %>%
group_by(cohort, period) %>%
summarize(amount = sum(amount), count = n_distinct(customer_id)) %>%
mutate(
age = period_diff(cohort, period, unit) + 1,
count_churn = 1 - count / lag(count),
amount_churn = 1 - amount / lag(amount),
cohort_count = first(count),
cohort_amount = nth(amount, 2),
avg_amount_per_cohort_company = amount / cohort_count,
cum_avg_amount = cumsum(avg_amount_per_cohort_company),
count_retention = count / cohort_count,
amount_retention = amount / nth(amount, cohort_retention_base_period),
agg_count_churn = monthly_aggregate_churn(cohort_count, count, age, unit),
agg_amount_churn = monthly_aggregate_churn(cohort_amount, amount, age, unit),
) %>% ungroup()
attr(cohortified_data, "period") = unit
return(cohortified_data)
}
remove_outliers <- function(cohort_data, percentile = 0.05) {
cohort_sizes <- cohort_data %>% group_by(cohort) %>% summarize(cohort_size = first(count)) %>% pull(cohort_size)
limit <- quantile(cohort_sizes, percentile)
filtered <- cohort_data %>% filter(cohort_count > limit)
}
human_numbers <- function(x = NULL, smbl ="", signif = 1){
humanity <- function(y){
if (!is.na(y)){
tn <- round(abs(y) / 1e12, signif)
b <- round(abs(y) / 1e9, signif)
m <- round(abs(y) / 1e6, signif)
k <- round(abs(y) / 1e3, signif)
if ( y >= 0 ){
y_is_positive <- ""
} else {
y_is_positive <- "-"
}
if ( k < 1 ) {
paste0( y_is_positive, smbl, round(abs(y), signif ))
} else if ( m < 1){
paste0 (y_is_positive, smbl, k , "K")
} else if (b < 1){
paste0 (y_is_positive, smbl, m ,"M")
}else if(tn < 1){
paste0 (y_is_positive, smbl, b ,"B")
} else {
paste0 (y_is_positive, smbl, comma(tn), "T")
}
} else if (is.na(y) | is.null(y)){
"-"
}
}
sapply(x,humanity)
}
#' Human versions of large currency numbers - extensible via smbl
human_gbp <- function(x){human_numbers(x, smbl = "£")}
human_usd <- function(x){human_numbers(x, smbl = "$")}
human_euro <- function(x){human_numbers(x, smbl = "€")}
human_num <- function(x){human_numbers(x, smbl = "")}
per_cohort_monthly_churn <- function(cohort_data) {
unit <- attr(cohort_data, "period")
cohort_data %>%
group_by(cohort) %>%
filter(row_number() == n()) %>%
mutate(monthly_churn = monthly_aggregate_churn(cohort_count, count, age, unit)) %>%
select(cohort, monthly_churn) %>%
ungroup() %>% filter(row_number() != n())
}
# CHARTS
theme_heatmap <- function() {
return(
theme_bw() +
theme(
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank()
)
)
}
cohort_retention_line_plot <- function(cohort_data) {
ggplot(cohort_data, aes(x = age, y=amount, group = cohort, color = factor(cohort))) +
geom_line(size=1) +
geom_point() +
xlab(paste0("Age (", attr(cohort_data, "period"), "s)")) + ylab(paste0("Revenue")) + labs(color = "Cohort") + theme_bw() +
scale_y_continuous(label = human_numbers)
}
avg_churn_plot <- function(cohort_data) {
churn <- cohort_data %>% group_by(age) %>% summarize(avg_churn = mean(count_churn)) %>% filter(!is.na(avg_churn))
ggplot(churn, aes(x = age, y = avg_churn)) + geom_point() + geom_smooth() + scale_y_continuous(labels=scales::percent) +
xlab(paste0("Age (", attr(cohort_data, "period"), "s)")) + ylab(paste0("Customer Churn (", attr(cohort_data, "period"), "s)")) + theme_bw() + expand_limits(y = 0)
}
agg_count_churn_cohort_plot <- function(cohort_data) {
unit <- attr(cohort_data, "period")
churn_data <- cohort_data %>% remove_outliers() %>% per_cohort_monthly_churn()
ggplot(churn_data, aes(x = cohort, y = monthly_churn)) +
geom_line(color = "#31775a", size = 1) +
scale_y_continuous(labels=scales::percent) +
xlab("Cohort") + ylab("Average Monthly Churn") + theme_bw() +
expand_limits(y = 0)
}
agg_count_churn_age_plot <- function(cohort_data) {
unit <- attr(cohort_data, "period")
churn_data <- cohort_data %>% remove_outliers() %>% group_by(age) %>% summarize(avg_churn = mean(agg_count_churn)) %>% filter(row_number() > 1)
ggplot(churn_data, aes(x = age, y = avg_churn)) +
geom_line(color = "#31775a", size = 1) +
scale_y_continuous(labels=scales::percent) +
xlab(paste0("Age (", unit, "s)")) + ylab("Average Monthly Churn") + theme_bw() +
expand_limits(y = 0)
}
agg_amount_churn_age_plot <- function(cohort_data) {
unit <- attr(cohort_data, "period")
churn_data <- cohort_data %>% remove_outliers() %>% group_by(age) %>% summarize(avg_churn = mean(agg_amount_churn)) %>% filter(row_number() > 1)
ggplot(churn_data, aes(x = age, y = avg_churn)) +
geom_line(color = "#31775a", size = 1) +
scale_y_continuous(labels=scales::percent) +
xlab(paste0("Age (", unit, "s)")) + ylab("Average Monthly Churn") + theme_bw() +
expand_limits(y = 0)
}
revenue_heatmap <- function(cohort_data) {
unit <- attr(cohort_data, "period")
ggplot(data = cohort_data, aes(x = age, y = cohort)) + geom_tile(aes(fill = amount)) +
geom_text(aes(label = human_numbers(amount, signif = 0)), size = 4) +
scale_fill_gradient(low = "white", high = "#31775a") + theme_heatmap() +
scale_x_continuous(breaks = seq(min(cohort_data$age), max(cohort_data$age), 1), expand = c(0, 0)) +
scale_y_date(breaks = sort(unique(cohort_data$cohort)), expand = c(0, 0)) +
xlab(paste0("Age (", unit, "s)")) + ylab("Cohort")
}
relative_retention_heatmap <- function(cohort_data) {
unit <- attr(cohort_data, "period")
cohort_data <- cohort_data %>% filter(!is.na(amount_retention))
ggplot(data = cohort_data, aes(x = age, y = cohort)) + geom_tile(aes(fill = amount_retention)) +
geom_text(aes(label = sprintf("%1.0f%%", 100*amount_retention)), size = 4) +
scale_fill_gradient(low = "white", high = "#31775a") + theme_heatmap() +
scale_x_continuous(breaks = seq(min(cohort_data$age), max(cohort_data$age), 1), expand = c(0, 0)) +
scale_y_date(breaks = sort(unique(cohort_data$cohort)), expand = c(0, 0)) +
xlab(paste0("Age (", unit, "s)")) + ylab("Cohort")
}
per_active_customer_revenue_table <- function(cohort_data) {
ggplot(data = cohort_data, aes(x = age, y = cohort)) + geom_tile(aes(fill = amount/count)) +
geom_text(aes(label = human_numbers(amount/count)), size = 2) +
scale_fill_gradient(low = "white", high = "darkgreen") + theme_bw() +
#theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank(), plot.margin=unit(c(0,0,0,0), "cm"), legend.position = "none") +
theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank(), plot.margin=unit(c(0,0,0,0), "cm"), legend.position = "none") +
#scale_y_date(date_breaks = "3 months", date_minor_breaks = "1 month") +
xlab("Age") + labs(fill = "Revenue")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment