Dpananos · October 11, 2024 04:42
diff --git a/kaplan_meir.sql b/kaplan_meir.sql
 SET VARIABLE EXPERIMENT_START_DATE = TIMESTAMP '2024-10-01 00:00:00';
 SET VARIABLE EXPERIMENT_END_DATE = TIMESTAMP '2024-10-31 00:00:00';


 with prep_cleaned_exposures as (
  select distinct
    userid, 
    treatment,
    min(exposure_time) over (partition by userid) as exposure_time,
    (min(treatment) over (partition by userid)) <> (max(treatment) over (partition by userid)) as multiple_exposures
  from experiment_000
  where 
    (
    exposure_time <= getvariable('EXPERIMENT_END_DATE')
    and
    exposure_time >= getvariable('EXPERIMENT_START_DATE')
    )
  order by treatment, userid
  
 )

 , cleaned_exposures as (select * from prep_cleaned_exposures where not multiple_exposures  )

 , exposure_balance as (select treatment, count(distinct userid) n_exposed from cleaned_exposures group by 1)

 , prep_experiment_outcomes as (
  select 
    a.userid, 
    a.treatment, 
    a.exposure_time, 
    b.observed_time as event_time, 
    c.n_exposed as at_risk
  from cleaned_exposures as a
  left join experiment_000 as b 
    on a.userid = b.userid and a.exposure_time <  coalesce(b.observed_time, getvariable('EXPERIMENT_END_DATE'))
  left join exposure_balance c on a.treatment = c.treatment
 )

 , cleaned_experiment_outcomes as (
  select
    userid, 
    treatment, 
    exposure_time, 
    event_time is not NULL as is_observed, 
    at_risk,
    coalesce(event_time, getvariable('EXPERIMENT_END_DATE') ) as observed_time,
    datediff('seconds', exposure_time, coalesce(event_time, getvariable('EXPERIMENT_END_DATE') )) * 1.0 / (86400) as event_time_days,
  from prep_experiment_outcomes
  order by treatment, userid
 )

 , prep_lifetable_1 as (

 select
  
  treatment, 
  case when not is_observed then floor(event_time_days) else floor(event_time_days) + 0.5 end as event_time,
  coalesce(count(distinct case when  is_observed then userid end), 0) as n_event,
  coalesce(count(distinct case when not is_observed then userid end), 0) as n_censor,
  at_risk as n_exposed
  
 from cleaned_experiment_outcomes
 group by 1, 2, 5
 order by 1, 2

 )

 , life_table as (

 select 
 *, 
 coalesce(lag(at_risk1, 1) over (partition by treatment order by event_time), n_exposed) as at_risk,
 n_event::float / at_risk::float as hazard
 from(
  select 
    *, 
    n_exposed - (sum(n_event + n_censor) over (partition by treatment order by event_time rows between unbounded preceding and current row)) as at_risk1
  from prep_lifetable_1
  order by 1, 2
 )

 )

 select 
  *, 
  exp(sum(ln(1-hazard)) over (partition by treatment order by event_time rows between unbounded preceding and current row)) as S
 from life_table
 order by 1, 2
	SET VARIABLE EXPERIMENT_START_DATE = TIMESTAMP '2024-10-01 00:00:00';
	SET VARIABLE EXPERIMENT_END_DATE = TIMESTAMP '2024-10-31 00:00:00';


	with prep_cleaned_exposures as (
	select distinct
	userid,
	treatment,
	min(exposure_time) over (partition by userid) as exposure_time,
	(min(treatment) over (partition by userid)) <> (max(treatment) over (partition by userid)) as multiple_exposures
	from experiment_000
	where
	(
	exposure_time <= getvariable('EXPERIMENT_END_DATE')
	and
	exposure_time >= getvariable('EXPERIMENT_START_DATE')
	)
	order by treatment, userid

	)

	, cleaned_exposures as (select * from prep_cleaned_exposures where not multiple_exposures )

	, exposure_balance as (select treatment, count(distinct userid) n_exposed from cleaned_exposures group by 1)

	, prep_experiment_outcomes as (
	select
	a.userid,
	a.treatment,
	a.exposure_time,
	b.observed_time as event_time,
	c.n_exposed as at_risk
	from cleaned_exposures as a
	left join experiment_000 as b
	on a.userid = b.userid and a.exposure_time < coalesce(b.observed_time, getvariable('EXPERIMENT_END_DATE'))
	left join exposure_balance c on a.treatment = c.treatment
	)

	, cleaned_experiment_outcomes as (
	select
	userid,
	treatment,
	exposure_time,
	event_time is not NULL as is_observed,
	at_risk,
	coalesce(event_time, getvariable('EXPERIMENT_END_DATE') ) as observed_time,
	datediff('seconds', exposure_time, coalesce(event_time, getvariable('EXPERIMENT_END_DATE') )) * 1.0 / (86400) as event_time_days,
	from prep_experiment_outcomes
	order by treatment, userid
	)

	, prep_lifetable_1 as (

	select

	treatment,
	case when not is_observed then floor(event_time_days) else floor(event_time_days) + 0.5 end as event_time,
	coalesce(count(distinct case when is_observed then userid end), 0) as n_event,
	coalesce(count(distinct case when not is_observed then userid end), 0) as n_censor,
	at_risk as n_exposed

	from cleaned_experiment_outcomes
	group by 1, 2, 5
	order by 1, 2

	)

	, life_table as (

	select
	*,
	coalesce(lag(at_risk1, 1) over (partition by treatment order by event_time), n_exposed) as at_risk,
	n_event::float / at_risk::float as hazard
	from(
	select
	*,
	n_exposed - (sum(n_event + n_censor) over (partition by treatment order by event_time rows between unbounded preceding and current row)) as at_risk1
	from prep_lifetable_1
	order by 1, 2
	)

	)

	select
	*,
	exp(sum(ln(1-hazard)) over (partition by treatment order by event_time rows between unbounded preceding and current row)) as S
	from life_table
	order by 1, 2