Skip to content

Instantly share code, notes, and snippets.

@zjuul
Last active December 26, 2020 15:37
Show Gist options
  • Save zjuul/cac3f6e210cc64938bb5740d2920897f to your computer and use it in GitHub Desktop.
Save zjuul/cac3f6e210cc64938bb5740d2920897f to your computer and use it in GitHub Desktop.
# From: https://stuifbergen.com/2018/03/cohort-analysis-with-snowplow-and-r/
########## 1: get data
appname <- "NAME" # used as selection criterium, replace that with the name of your app
q <- paste0("
SELECT
user_id,
TO_CHAR(CONVERT_TIMEZONE('UTC', 'Europe/Amsterdam', derived_tstamp),'YYYY ww') AS yw
FROM atomic.events
WHERE app_id = '", appname, "' AND derived_tstamp >= '2018-01-01'
GROUP BY 1,2;
")
conn <- dbConnect(driver, url) # make the database connection - this is from the RJDBC package (library(RJDBC))
dbdata <- dbGetQuery(conn, q) # get the data
dbDisconnect(conn) # close the connection
########## 2: make cohort table
library(tidyverse)
cohort <- dbdata %>% # store in cohort table, get from dbdata
group_by(user_id) %>% # group all users together
mutate(first = min(yw)) %>% # for every user, find the first period
group_by(first, yw) %>% # group by this first period + the other periods
summarise(users = n()) %>% # for each combination, count the number of users
spread(yw, users) # and make columns with period names
########## 3: make pretty
shiftrow <- function(v) {
# put a vector in, strip off leading NA values, and place that amount at the end
first_na_index <- min( which(!is.na(v)) )
# return that bit to the end, and pad with NAs.
c(v[first_na_index:length(v)], rep(NA, first_na_index-1))
}
# create a new dataframe, with shifted rows (and keep the first one)
shifted <- data.frame(
cohort = cohort$first,
t(apply( select(as.data.frame(cohort), 2:ncol(cohort)), # 2nd column to the end
1, # for every row
shiftrow ))
)
# and make column names readable
# first should be "cohort" and the rest week.<number>, (padded)
colnames(shifted) <- c("cohort", sub("","week.", str_pad(1:(ncol(shifted)-1),2,pad = "0")))
# percentages
shifted_pct <- data.frame(
cohort = shifted$cohort, # first column
shifted[,2:nrow(shifted)+1] / shifted[["week.01"]] # rest: divide by week.01
)
######### 4: prepare plot data
# ggplot loves long data. Let's melt it. One for the absolute values, one for the pcts
plotdata_abs <- gather(shifted, "cohort_age", "people" ,2:ncol(shifted ))
plotdata_pct <- gather(shifted_pct, "cohort_age", "percent" ,2:ncol(shifted_pct))
# now add some data.. we need pretty labels..
# first bit is the length of the width of the wide column (minus 1, that's the cohort name)
# that contains the absolute numbers
# last bit is the rest, those are percentages.
labelnames <- c( plotdata_abs$people[1:(ncol(shifted)-1)],
plotdata_pct$percent[(ncol(shifted)):(nrow(plotdata_pct))])
# we need pretty labels.
pretty_print <- function(n) {
case_when( n <= 1 ~ sprintf("%1.0f %%", n*100),
n > 1 ~ as.character(n),
TRUE ~ " ") # for NA values, skip the label
}
# create the plot data
plotdata <- data.frame(
cohort = plotdata_pct$cohort,
cohort_age = plotdata_pct$cohort_age,
percentage = plotdata_pct$percent,
label = pretty_print(labelnames)
)
######### 5: plot!
# plot (with reordered y axis, oldest group on top)
# optional: if the percentages are really low, replace the 1.0 in the first column with zero
plotdata[which(plotdata$percentage == 1), "percentage"] <- 0
ggplot(plotdata, aes(x = cohort_age, y = reorder(cohort, desc(cohort)))) +
geom_raster(aes(fill = percentage)) +
scale_fill_continuous(guide = FALSE) + # no legend
geom_text(aes(label = label), color = "white") +
xlab("cohort age") + ylab("cohort") +
ggtitle(paste("Retention table (cohort) for",appname, "app"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment