Last active
December 26, 2020 15:37
-
-
Save zjuul/cac3f6e210cc64938bb5740d2920897f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From: https://stuifbergen.com/2018/03/cohort-analysis-with-snowplow-and-r/ | |
########## 1: get data | |
appname <- "NAME" # used as selection criterium, replace that with the name of your app | |
q <- paste0(" | |
SELECT | |
user_id, | |
TO_CHAR(CONVERT_TIMEZONE('UTC', 'Europe/Amsterdam', derived_tstamp),'YYYY ww') AS yw | |
FROM atomic.events | |
WHERE app_id = '", appname, "' AND derived_tstamp >= '2018-01-01' | |
GROUP BY 1,2; | |
") | |
conn <- dbConnect(driver, url) # make the database connection - this is from the RJDBC package (library(RJDBC)) | |
dbdata <- dbGetQuery(conn, q) # get the data | |
dbDisconnect(conn) # close the connection | |
########## 2: make cohort table | |
library(tidyverse) | |
cohort <- dbdata %>% # store in cohort table, get from dbdata | |
group_by(user_id) %>% # group all users together | |
mutate(first = min(yw)) %>% # for every user, find the first period | |
group_by(first, yw) %>% # group by this first period + the other periods | |
summarise(users = n()) %>% # for each combination, count the number of users | |
spread(yw, users) # and make columns with period names | |
########## 3: make pretty | |
shiftrow <- function(v) { | |
# put a vector in, strip off leading NA values, and place that amount at the end | |
first_na_index <- min( which(!is.na(v)) ) | |
# return that bit to the end, and pad with NAs. | |
c(v[first_na_index:length(v)], rep(NA, first_na_index-1)) | |
} | |
# create a new dataframe, with shifted rows (and keep the first one) | |
shifted <- data.frame( | |
cohort = cohort$first, | |
t(apply( select(as.data.frame(cohort), 2:ncol(cohort)), # 2nd column to the end | |
1, # for every row | |
shiftrow )) | |
) | |
# and make column names readable | |
# first should be "cohort" and the rest week.<number>, (padded) | |
colnames(shifted) <- c("cohort", sub("","week.", str_pad(1:(ncol(shifted)-1),2,pad = "0"))) | |
# percentages | |
shifted_pct <- data.frame( | |
cohort = shifted$cohort, # first column | |
shifted[,2:nrow(shifted)+1] / shifted[["week.01"]] # rest: divide by week.01 | |
) | |
######### 4: prepare plot data | |
# ggplot loves long data. Let's melt it. One for the absolute values, one for the pcts | |
plotdata_abs <- gather(shifted, "cohort_age", "people" ,2:ncol(shifted )) | |
plotdata_pct <- gather(shifted_pct, "cohort_age", "percent" ,2:ncol(shifted_pct)) | |
# now add some data.. we need pretty labels.. | |
# first bit is the length of the width of the wide column (minus 1, that's the cohort name) | |
# that contains the absolute numbers | |
# last bit is the rest, those are percentages. | |
labelnames <- c( plotdata_abs$people[1:(ncol(shifted)-1)], | |
plotdata_pct$percent[(ncol(shifted)):(nrow(plotdata_pct))]) | |
# we need pretty labels. | |
pretty_print <- function(n) { | |
case_when( n <= 1 ~ sprintf("%1.0f %%", n*100), | |
n > 1 ~ as.character(n), | |
TRUE ~ " ") # for NA values, skip the label | |
} | |
# create the plot data | |
plotdata <- data.frame( | |
cohort = plotdata_pct$cohort, | |
cohort_age = plotdata_pct$cohort_age, | |
percentage = plotdata_pct$percent, | |
label = pretty_print(labelnames) | |
) | |
######### 5: plot! | |
# plot (with reordered y axis, oldest group on top) | |
# optional: if the percentages are really low, replace the 1.0 in the first column with zero | |
plotdata[which(plotdata$percentage == 1), "percentage"] <- 0 | |
ggplot(plotdata, aes(x = cohort_age, y = reorder(cohort, desc(cohort)))) + | |
geom_raster(aes(fill = percentage)) + | |
scale_fill_continuous(guide = FALSE) + # no legend | |
geom_text(aes(label = label), color = "white") + | |
xlab("cohort age") + ylab("cohort") + | |
ggtitle(paste("Retention table (cohort) for",appname, "app")) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment