# Load packages
library(brms)
library(cmdstanr)
library(data.table) # here we use the development version of data.table
# install it with data.table::update_dev_pkg()
library(purrr)

# Set up parallel cores
options(mc.cores = 4)

# Simulate some truncated and censored delay data
init_cases <- 100
growth_rate <- 0.1
max_t <- 20
samples <- 400
# note we won't actually end up with this many samples as some will be truncated
logmean <- 1.6
logsd <- 0.6

# Simulate the underlying outbreak structure assuming exponential growth
cases <- data.table(
  cases = (init_cases * exp(growth_rate * (1:max_t))) |>
    map_dbl(~ rpois(1, .)),
  time = 1:max_t
)
plot(cases$cases)
# Make a case line list
linelist <- cases |>
  DT(, .(id = 1:cases), by = time)

# Simulate the observation process for the line list
obs <- data.table(
  time = sample(linelist$time, samples, replace = FALSE),
  delay = rlnorm(samples, logmean, logsd)
) |>
  # Add a new ID
  DT(, id := 1:.N) |>
  # When the data would be observed
  DT(, obs_delay := time + delay) |>
  # Integerise the delay
  DT(, daily_delay := floor(delay)) |>
  # Day after the delay
  DT(, day_after_delay := ceiling(delay)) |>
  # Time observed for
  DT(, obs_time := max_t - time) |>
  # We don't know this exactly so need to censor
  # Set to the midday point as the average across the day
  DT(, censored_obs_time := obs_time - 0.5) |>
  DT(, censored := "interval")
# Make event-based data for latent modelling
obs <- obs |>
  DT(, primary_event := floor(time)) |>
  DT(, secondary_event := floor(obs_delay)) |>
  DT(, max_t := max_t)

# Truncate observations
truncated_obs <- obs |>
  DT(obs_delay <= max_t)

double_truncated_obs <- truncated_obs |>
  # The lognormal family in brms does not support 0 so also drop delays
  # shorter than 1 day. This seems like it could be improved.
  DT(daily_delay >= 1)
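# A quick check (a sketch, not part of the original gist): how many
# observations were lost to right truncation, and how many more to
# dropping sub-day delays?
nrow(obs) - nrow(truncated_obs)
nrow(truncated_obs) - nrow(double_truncated_obs)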
# Fit a lognormal model with no corrections
naive_model <- brm(
  bf(daily_delay ~ 1, sigma ~ 1), data = double_truncated_obs,
  family = lognormal(), backend = "cmdstanr", adapt_delta = 0.9
)
# We see that the log mean estimate is biased by truncation;
# the sigma intercept needs to be exponentiated to return the log sd
summary(naive_model)
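# As a quick check (a sketch assuming the fit above succeeded), compare the
# posterior means with the simulated logmean = 1.6 and logsd = 0.6
fixef(naive_model)["Intercept", "Estimate"]
exp(fixef(naive_model)["sigma_Intercept", "Estimate"])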
# Adjust for truncation
trunc_model <- brm(
  bf(daily_delay | trunc(lb = 1, ub = censored_obs_time) ~ 1, sigma ~ 1),
  data = double_truncated_obs, family = lognormal(),
  backend = "cmdstanr", adapt_delta = 0.9
)
# Getting closer to recovering our simulated estimates
summary(trunc_model)
# Correct for censoring
censor_model <- brm(
  bf(daily_delay | cens(censored, day_after_delay) ~ 1, sigma ~ 1),
  data = double_truncated_obs, family = lognormal(),
  backend = "cmdstanr", adapt_delta = 0.9
)
# Less close than the truncation adjustment but better than the naive model
summary(censor_model)
# Correct for double interval censoring and truncation
censor_trunc_model <- brm(
  bf(
    daily_delay | trunc(lb = 1, ub = censored_obs_time) +
      cens(censored, day_after_delay) ~ 1,
    sigma ~ 1
  ),
  data = double_truncated_obs, family = lognormal(), backend = "cmdstanr"
)
# This recovers the underlying distribution.
# As the growth rate increases, and with short delays, we may still see a
# bias as we have a censored observation time.
summary(censor_trunc_model)
# Model censoring as a latent process (WIP)
# For this model we need to use a custom brms family and so
# the code is significantly more complex.

# Custom family for latent censoring and truncation
fit_latent_lognormal <- function(fn = brm, ...) {
  latent_lognormal <- custom_family(
    "latent_lognormal",
    dpars = c("mu", "sigma", "pwindow", "swindow"),
    links = c("identity", "log", "identity", "identity"),
    lb = c(NA, 0, 0, 0),
    ub = c(NA, NA, 1, 1),
    type = "real",
    vars = c("vreal1[n]", "vreal2[n]")
  )
  stan_funs <- "
  real latent_lognormal_lpdf(real y, real mu, real sigma, real pwindow,
                             real swindow, real sevent,
                             real end_t) {
    real p = y + pwindow;
    real s = sevent + swindow;
    real d = s - p;
    real obs_time = end_t - p;
    return lognormal_lpdf(d | mu, sigma) - lognormal_lcdf(obs_time | mu, sigma);
  }
  "
  stanvars <- stanvar(block = "functions", scode = stan_funs)
  # Set up shared priors ----------------------------------------------------
  priors <- c(
    prior(uniform(0, 1), class = "b", dpar = "pwindow", lb = 0, ub = 1),
    prior(uniform(0, 1), class = "b", dpar = "swindow", lb = 0, ub = 1)
  )
  fit <- fn(family = latent_lognormal, stanvars = stanvars, prior = priors, ...)
  return(fit)
}
# Fit the latent lognormal model
latent_model <- fit_latent_lognormal(
  bf(primary_event | vreal(secondary_event, max_t) ~ 1, sigma ~ 1,
     pwindow ~ 0 + as.factor(id), swindow ~ 0 + as.factor(id)),
  data = truncated_obs, backend = "cmdstanr", fn = brm,
  adapt_delta = 0.95
)
# We should also see parameter recovery using this method, though
# run-times are much higher and the model is somewhat unstable.
summary(latent_model)
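# A consolidated comparison (a sketch assuming all of the fits above
# succeeded): posterior means for each model against the simulated
# logmean = 1.6 and logsd = 0.6
models <- list(
  naive = naive_model, truncation = trunc_model, censoring = censor_model,
  censoring_truncation = censor_trunc_model, latent = latent_model
)
map(models, \(model) {
  est <- fixef(model)
  c(logmean = est["Intercept", "Estimate"],
    logsd = exp(est["sigma_Intercept", "Estimate"]))
})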
Nice - yes I agree it's ±1 padding. Useful to phrase it as the flooring operation for both dates. I also like the use of bpmodels as the simulator (I was actually just looking at this for the write-up).
This is a really interesting point @parksw3!
@sbfnk see a crazily-named branch of that repo for some fleshing out of the analysis skeleton.
I merged both branches so you can also look at the main branch.
I wrote this before reading @sbfnk's code and now realized why it's useful if people don't want to deal with the latent approach.

I'm confused about why the censoring interval ever needs to span 2 days. If all dates are reported correctly, such that `p_dis = floor(p_con)` and `s_dis = floor(s_con)`, then the censoring interval for the first event is `p_dis` to `p_dis + 1` and for the second event is `s_dis` to `s_dis + 1`, regardless of what the delay is. Maybe I'm missing something, but I think framing censoring in terms of `p_dis` and `s_dis` seems most clear to me, rather than having to rely on the delay itself.
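As a minimal sketch of this framing (illustrative values only; `p_con` and `s_con` here are hypothetical continuous event times):

```r
library(data.table)

# Each event is censored to its own day, regardless of the delay between them
p_con <- rexp(5, 0.1)                # continuous primary event times
s_con <- p_con + rlnorm(5, 1.6, 0.6) # continuous secondary event times
data.table(
  p_lower = floor(p_con), p_upper = floor(p_con) + 1,
  s_lower = floor(s_con), s_upper = floor(s_con) + 1
)
```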
When `p_dis - s_dis = 0`, then the bounds we want to put on `s_con` are between `p_lat` and `s_dis + 1`, where `p_lat` is the latent variable estimator for `p_con`. So we need a little more tweaking of the `brms` code than what @seabbs initially wrote.

I also chatted with Jonathan on this topic the other day and think we figured out why the uniform prior is both bad and good. Let's think of a generic case where all individuals who experience the first event between time `p_1` and `p_2` (neither of which needs to be a discrete value) get their primary event reported at time `p_dis`. Then, for a cohort of individuals who report their event at `p_dis`, the distribution of true event times should be proportional to the incidence between time `p_1` and `p_2`. The same goes for the second event. So far we've been dealing with exponential cases with daily time steps, which is a special case of this (`p_1 = floor(p_con)` and `p_2 = p_1 + 1`), assuming a uniform prior for the inference. In this case, both `p_lat` and `s_lat` give biased estimates of `p_con` and `s_con` in the same direction, and the biases cancel out such that `s_lat - p_lat` gives unbiased estimates of `s_con - p_con`!!

So if we were dealing with a case where the primary event was happening during the growth phase and the secondary event was happening during the decay phase, we'll see the uniform bias pop out. Not sure how this works for the censoring without the latent case, but will work out simulations very very very soon (sorry, need to send something to Bryan this week).

Also invited @sbfnk to the repo (https://github.com/parksw3/dynamicaltruncation).
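A minimal simulation sketch of the cancellation argument above, assuming exponential growth at a hypothetical rate `r` and daily censoring with uniform imputation (illustrative values only):

```r
set.seed(1)
r <- 0.2 # growth rate
n <- 1e5

# Primary event times under exponential growth on [0, 20], via the inverse CDF
p_con <- log(1 + runif(n) * (exp(r * 20) - 1)) / r
s_con <- p_con + rlnorm(n, 1.6, 0.6)

# Daily censoring with uniform imputation of each event within its day
p_lat <- floor(p_con) + runif(n)
s_lat <- floor(s_con) + runif(n)

mean(p_lat - p_con)                     # biased (events skew late in the day)
mean(s_lat - s_con)                     # biased in the same direction
mean((s_lat - p_lat) - (s_con - p_con)) # the biases approximately cancel
```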