jeffeaton · May 1, 2024 13:12 · jeffeaton · May 1, 2024
diff --git a/phia-incidence-extract_2024-04-25.R b/phia-incidence-extract_2024-04-25.R
 library(dplyr)
 library(readr)
 library(tidyr)
 library(forcats)
 library(haven)
 library(survey)

 ## devtools::install_github("jeffeaton/svyincid")
 library(svyincid)

 phia_path <- "~/Data/household surveys/PHIA/datasets"

 cmr2017phia <- rdhs::read_zipdata(file.path(phia_path, "CMR/datasets/302 CAMPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip"))
 civ2018phia <- rdhs::read_zipdata(file.path(phia_path, "CIV/datasets/302 CIPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip"))
 eth2018phia <- rdhs::read_zipdata(file.path(phia_path, "ETH/datasets/302 EPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip"))
 ken2018phia <- rdhs::read_zipdata(file.path(phia_path, "KEN/datasets/KENPHIA 2018 Household Interview and Biomarker Datasets v1.1 (DTA).zip"), "kenphia2018adultbio.dta")
 lso2017phia <- rdhs::read_zipdata(file.path(phia_path, "LSO/datasets/302_LePHIA 2016-2017 Adult Biomarker Dataset (DTA).zip"))
 mwi2016phia <- rdhs::read_zipdata(file.path(phia_path, "MWI/datasets/MPHIA 2015-2016 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "mphia2015adultbio.dta")
 nam2017phia <- rdhs::read_zipdata(file.path(phia_path, "NAM/datasets/302_NAMPHIA 2017 Adult Biomarker Dataset (DTA).zip"))
 rwa2019phia <- rdhs::read_zipdata(file.path(phia_path, "RWA/datasets/302 RPHIA 2018-2019 Adult Biomarker Dataset (DTA).zip"))
 swz2017phia <- rdhs::read_zipdata(file.path(phia_path, "SWZ/datasets/SHIMS2 2016-2017 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "shims22016adultbio.dta")
 tza2017phia <- rdhs::read_zipdata(file.path(phia_path, "TZA/datasets/THIS 2016-2017 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "this2016adultbio.dta")
 uga2017phia <- rdhs::read_zipdata(file.path(phia_path, "UGA/datasets/302 UPHIA 2016-2017 Adult Biomarker Dataset (DTA).zip"))
 zmb2016phia <- rdhs::read_zipdata(file.path(phia_path, "ZMB/datasets/ZAMPHIA 2016 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "zamphia2016adultbio.dta")
 zwe2016phia <- rdhs::read_zipdata(file.path(phia_path, "ZWE/datasets/302_ZIMPHIA 2015-2016 Adult Biomarker Dataset (DTA).zip"))
 nga2021phia <- haven::read_dta("~/Data/household surveys/Nigeria/NAIIS/public/datasets/NAIIS2018adultbio.dta")

 #' ## PHIA2
 mwi2020phia <- haven::read_dta(unz(file.path(phia_path, "MWI2020PHIA/datasets/MPHIA 2020-2021 Household Interview and Biomarker Data (DTA).zip"), "mphia2020adultbio.dta"))
 lso2020phia <- haven::read_dta(unz(file.path(phia_path, "LSO2020PHIA/datasets/LePHIA 2020 Household (DTA).zip"), "lephia2020adultbio.dta"))
 moz2021phia <- haven::read_dta(unz(file.path(phia_path, "MOZ2021PHIA/datasets/INSIDA 2021 Household Interview and Biomarker Datasets (DTA).zip"), "insida2021adultbio.dta"))
 swz2021phia <- haven::read_dta(unz(file.path(phia_path, "SWZ2021PHIA/datasets/SHIMS3 2021 Household Interview and Biomarker Datasets (DTA).zip"), "shims32021adultbio.dta"))
 zwe2020phia <- haven::read_dta(unz(file.path(phia_path, "ZWE2020PHIA/datasets/ZIMPHIA 2020 Household (DTA).zip"), "zimphia2020adultbio.dta"))
 zmb2021phia <- haven::read_dta(unz(file.path(phia_path, "ZMB2021PHIA/ZAMPHIA 2021 PU Package.zip"), "zamphia2021adultbio.dta"))
 bwa2021phia <- haven::read_dta("~/Data/household surveys/Botswana/BAIS V/datasets/baisv2021adultbio.dta.zip")

 nga2021phia$centroidid <- sprintf("NG%04d%05d", nga2021phia$varstrat, nga2021phia$varunit)

 datl <- grep("[a-z]{3}20[0-9]{2}phia$", ls(), value = TRUE) %>%
  {setNames(lapply(., get), toupper(.))}

 dat <- datl %>%
  lapply(select, any_of(c("country", "householdid", "personid", "centroidid", "gender", "age", "hivstatusfinal", "recentlagvl", "recentlagvlarv", "btwt0"))) %>%
  lapply(haven::zap_labels) %>%
  Map(mutate, ., survey_id = names(.)) %>%
  bind_rows()

 phia_info <- tribble(
  ~survey_id,    ~iso3, ~country,        ~survey_year_label, ~survey_year, ~phia_phase,
  "CMR2017PHIA", "CMR", "Cameroon",      "2017",             2017,         "PHIA1",
  "CIV2018PHIA", "CIV", "Cote d'Ivoire", "2017-2018",        2018,         "PHIA1",
  "SWZ2017PHIA", "SWZ", "Eswatini",      "2016-2017",        2017,         "PHIA1",
  "ETH2018PHIA", "ETH", "Ethiopia",      "2017-2018",        2018,         "PHIA1",
  "KEN2018PHIA", "KEN", "Kenya",         "2018",             2018,         "PHIA1",
  "LSO2017PHIA", "LSO", "Lesotho",       "2016-2017",        2017,         "PHIA1",
  "MWI2016PHIA", "MWI", "Malawi",        "2015-2016",        2016,         "PHIA1",
  "NAM2017PHIA", "NAM", "Namibia",       "2017",             2017,         "PHIA1",
  "RWA2019PHIA", "RWA", "Rwanda",        "2018-2019",        2019,         "PHIA1",
  "TZA2017PHIA", "TZA", "Tanzania",      "2016-2017",        2017,         "PHIA1",
  "UGA2017PHIA", "UGA", "Uganda",        "2016-2017",        2017,         "PHIA1",
  "ZMB2016PHIA", "ZMB", "Zambia",        "2016",             2016,         "PHIA1",
  "ZWE2016PHIA", "ZWE", "Zimbabwe",      "2015-2016",        2016,         "PHIA1",
  "NGA2021PHIA", "NGA", "Nigeria",       "2018",             2018,         "PHIA1",
  ##                                                                               
  "LSO2020PHIA", "LSO", "Lesotho",       "2020",             2020,         "PHIA2",
  "MOZ2021PHIA", "MOZ", "Mozambique",    "2021",             2021,         "PHIA2",
  "MWI2020PHIA", "MWI", "Malawi",        "2020",             2020,         "PHIA2",
  "SWZ2021PHIA", "SWZ", "Eswatini",      "2021",             2021,         "PHIA2",
  "ZWE2020PHIA", "ZWE", "Zimbabwe",      "2020",             2020,         "PHIA2",
  "ZMB2021PHIA", "ZMB", "Zambia",        "2021",             2021,         "PHIA2",
  "BWA2021PHIA", "BWA", "Botswana",      "2021",             2021,         "PHIA2"
 )

 dat <- dat %>%
  select(-country) %>%
  left_join(phia_info)

 dat %>%
  count(survey_id, hivstatusfinal) %>%
  spread(hivstatusfinal, n) %>%
  print(n = Inf)

 dat <- dat %>%
  mutate(
    hivstatus
 dat %>%
  count(survey_id, recentlagvlarv) %>%
  spread(recentlagvlarv, n) %>%
  print(n = Inf)


 #' Recode the recent infection status as quadrinomial outcome:
 #' "0. negative":   HIV negative
 #' "1. recent":     HIV positive and recent HIV infection
 #' "2. long-term":  HIV positive and long-term infection (not recent)
 #' "9. not tested": HIV positive and no recent infection available (assumed missing-at-random)
 #'
 #' It must be a **factor variable with four levels** in this order.
 #' The value labels are not important, but the order is important.
 #' 

 dat <- dat %>%
  mutate(
    recent_prev = case_when(hivstatusfinal == 2                         ~ "0. negative",
                            hivstatusfinal == 1 & recentlagvlarv == 1   ~ "1. recent",
                            hivstatusfinal == 1 & recentlagvlarv == 2   ~ "2. long-term",
                            hivstatusfinal == 1 & is.na(recentlagvlarv) |
                              !recentlagvlarv %in% 1:2                  ~ "9. not tested"),
    recent_prev = factor(recent_prev,
                         levels = c("0. negative", "1. recent", "2. long-term", "9. not tested"))
  )

 dat %>%
  filter(btwt0 > 0) %>%
  count(survey_id, recent_prev) %>%
  spread(recent_prev, n) %>%
  print(n = Inf)

 dat <- dat %>%
  filter(!is.na(btwt0) & btwt0 > 0)

 dat <- dat %>%
  mutate(
    sex = recode(gender, `1` = "male", `2` = "female"),
    age_group = naomi::cut_naomi_age_group(age),
    age_group10 = age_group %>%
      fct_collapse("Y015_024" = c("Y015_019", "Y020_024"),
                   "Y025_034" = c("Y025_029", "Y030_034"),
                   "Y035_044" = c("Y035_039", "Y040_044"),
                   "Y045_054" = c("Y045_049", "Y050_054"),
                   "Y055_064" = c("Y055_059", "Y060_064"),
                   other_level = NA),
    age_15to49 = if_else(age %in% 15:49, "Y015_049", NA_character_),
    age_15to64 = if_else(age %in% 15:64, "Y015_064", NA_character_)
  )


 #' Split datasets by survey because some surveys use different MDRI and FRR

 datspl <- dat %>%
  split(.$survey_id)

 #' Create standard survey::survey.design object
 desspl <- datspl %>%
  lapply(\(x) svydesign(ids = ~centroidid, weights = ~btwt0, nest = TRUE, data = x))


 #' Use the function svyprevincid() to calculate the HIV incidence and prevalence.
 #'
 #' This function uses the delta method to also account for uncertainty about the
 #' mean duration of recent infection (MDRI) and false recent ratio (FRR).
 #'
 #' Some notes on the function:
 #'
 #' * The function returns a `svystat` object, the standard return from the survey package.
 #'   This allows standard specification for reporting out CIs, design effects, etc.
 #'
 #' * You can use it with `svyby()` to calculate incidence and prevalence stratified by
 #'   categorical variables, and get it to return 95% CIs, design effect, etc.
 #'
 #' * It takes as arguments the MDRI [in years] and FRR and the standard errors for
 #'   each of these.
 #'   - For PHIA surveys, the MDRI was assumed to be 135 days (= 0.370 years)
 #'     with 95% CI 118-142 days (SE = 6.12 days = 0.017 years)
 #'     id__EXCEPT for Uganda, which used 153 days (95% CI 127-178 days) = 0.42 years (SE = 0.036).
 #'   - For PHIA surveys, the FRR was assumed to be 0.0 with SE = 0.0
 #'   
 #'  * It does not allow NA values in the recent infection variable (it does not have an
 #'    `na.rm = TRUE` argument), so ensure to subset to remove NA calculation.
 #' 

 #' To calculate incidence and prevalence among adults 15-49 years (reproduces
 #' MPHIA 2015-16 summary sheet)
 svyprevincid(~recent_prev,
             design = subset(desspl$MWI2016PHIA, age %in% 15:49),
             frr = 0.0, se.frr = 0.0, deff = TRUE)

 #' Incidence by sex using svyby():
 svyby(~recent_prev, ~sex, subset(desspl$MWI2016PHIA, age %in% 15:49), svyprevincid,
      frr = 0.0, se.frr = 0.0, deff = TRUE)


 #' # Calculate incidence by sex and age categories

 mdri <- setNames(rep(130 / 365, length(desspl)), names(desspl))
 se.mdri <- mdri
 se.mdri[] <- (142 - 118)/365/(2 * qnorm(0.975))

 #' Uganda PHIA MDRI: 153 days (95% CI 127-178 days) = 0.42 years (SE = 0.036)
 mdri["UGA2017PHIA"] <- 153 / 365
 se.mdri["UGA2017PHIA"] <- (178 - 127)/365/(2 * qnorm(0.975))

 incid_sex_age_group <-  Map(\(des, mdri, se.mdri)
                            svyby(~recent_prev, ~survey_id + sex + age_group,
                                  des, svyprevincid,
                                  mdri = mdri, se.mdri = se.mdri,
                                  frr = 0.0, se.frr = 0.0, deff = TRUE),
                            desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
  bind_rows()

 incid_age_group <-  Map(\(des, mdri, se.mdri)
                        svyby(~recent_prev, ~survey_id + age_group,
                              des, svyprevincid,
                              mdri = mdri, se.mdri = se.mdri,
                              frr = 0.0, se.frr = 0.0, deff = TRUE),
                        desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
  bind_rows() %>%
  mutate(sex = "both")

 incid_sex_age_group10 <-  Map(\(des, mdri, se.mdri)
                              svyby(~recent_prev, ~survey_id + sex + age_group10,
                                    des, svyprevincid,
                                    mdri = mdri, se.mdri = se.mdri,
                                    frr = 0.0, se.frr = 0.0, deff = TRUE),
                              desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
  bind_rows() %>%
  rename(age_group = age_group10)

 incid_age_group10 <-  Map(\(des, mdri, se.mdri)
                          svyby(~recent_prev, ~survey_id + age_group10,
                                des, svyprevincid,
                                mdri = mdri, se.mdri = se.mdri,
                                frr = 0.0, se.frr = 0.0, deff = TRUE),
                          desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
  bind_rows() %>%
  mutate(sex = "both") %>%
  rename(age_group = age_group10)

 incid_sex_age_15to49 <-  Map(\(des, mdri, se.mdri)
                              svyby(~recent_prev, ~survey_id + sex + age_15to49,
                                    des, svyprevincid,
                                    mdri = mdri, se.mdri = se.mdri,
                                    frr = 0.0, se.frr = 0.0, deff = TRUE),
                              desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
  bind_rows() %>%
  rename(age_group = age_15to49)

 incid_age_15to49 <-  Map(\(des, mdri, se.mdri)
                          svyby(~recent_prev, ~survey_id + age_15to49,
                                des, svyprevincid,
                                mdri = mdri, se.mdri = se.mdri,
                                frr = 0.0, se.frr = 0.0, deff = TRUE),
                          desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
  bind_rows() %>%
  mutate(sex = "both") %>%
  rename(age_group = age_15to49)


 incid_sex_age_15to64 <-  Map(\(des, mdri, se.mdri)
                              svyby(~recent_prev, ~survey_id + sex + age_15to64,
                                    des, svyprevincid,
                                    mdri = mdri, se.mdri = se.mdri,
                                    frr = 0.0, se.frr = 0.0, deff = TRUE),
                              desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
  bind_rows() %>%
  rename(age_group = age_15to64)

 incid_age_15to64 <-  Map(\(des, mdri, se.mdri)
                          svyby(~recent_prev, ~survey_id + age_15to64,
                                des, svyprevincid,
                                mdri = mdri, se.mdri = se.mdri,
                                frr = 0.0, se.frr = 0.0, deff = TRUE),
                          desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
  bind_rows() %>%
  mutate(sex = "both") %>%
  rename(age_group = age_15to64)



 incid <- bind_rows(incid_sex_age_15to49, incid_age_15to49,
                   incid_sex_age_15to64, incid_age_15to64,
                   incid_sex_age_group10, incid_age_group10,
                   incid_sex_age_group, incid_age_group)

 #' Add 95% CI on log scale (avoid ci_lower < 0))
 incid <- incid %>%
  mutate(
    se.log_incid = se.incid / incid,
    ci_l.incid = exp(log(incid) - qnorm(0.975) * se.log_incid),
    ci_u.incid = exp(log(incid) + qnorm(0.975) * se.log_incid),
    se.log_incid = NULL
  )

 incid <- right_join(phia_info, incid) %>%
  as_tibble()

 #' Add raw counts
 #'
 #' Assuming FRR = 0 and all HIV positive are tested for recent infection,
 #' the incidence calculation becomes: [incidence] = [# recent] / (MDRI * [# negative]).
 #'
 #' Assuming no uncertaity in MDRI, the Poisson approximation becomes:
 #' [# recent] ~ Poisson( [incidence] * MDRI * [# negative])
 #'
 #' -> Use:
 #'    - n_pys_approx = MDRI * [# negative] as an approximation to PYs
 #'    - x_recent_approx = incidence * n_pys_approx
 #'
 #' The approximation ignores uncertainty from:
 #' 1. Design effects (clustering and unequal sample weights)
 #' 2. Proportion of HIV positive not tested for recent infection
 #' 3. Uncertainty about MDRI
 #'
 #' Number 1 could be partially accounted for with Kish effective sample size (as
 #' with other Naomi inputs) and number 2 through deflating the number negative
 #' denominator by the proportion of positive respondents tested for HIV.
 #' In practice proportion missing recent is pretty small in PHIA surveys, so
 #' unlikely to be a major difference.
 #'

 dat <- dat %>%
  mutate(
    recent_prev_label = recent_prev %>%
      recode("0. negative" = "n_negative",
             "1. recent" = "n_pos_recent",
             "2. long-term" = "n_pos_longterm",
             "9. not tested" = "n_pos_nottest")
  )


 counts_sex_age_group <- dat %>%
  count(survey_id, sex, age_group, recent_prev_label) %>%
  pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

 counts_age_group <- dat %>%
  count(survey_id, sex = "both", age_group, recent_prev_label) %>%
  pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

 counts_sex_age_group10 <- dat %>%
  count(survey_id, sex, age_group = age_group10, recent_prev_label) %>%
  pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

 counts_age_group10 <- dat %>%
  count(survey_id, sex = "both", age_group = age_group10, recent_prev_label) %>%
  pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

 counts_sex_age_15to49 <- dat %>%
  count(survey_id, sex, age_group = age_15to49, recent_prev_label) %>%
  pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

 counts_age_15to49 <- dat %>%
  count(survey_id, sex = "both", age_group = age_15to49, recent_prev_label) %>%
  pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

 counts_sex_age_15to64 <- dat %>%
  count(survey_id, sex, age_group = age_15to64, recent_prev_label) %>%
  pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

 counts_age_15to64 <- dat %>%
  count(survey_id, sex = "both", age_group = age_15to64, recent_prev_label) %>%
  pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

 counts_raw <- bind_rows(counts_sex_age_15to49, counts_age_15to49,
                        counts_sex_age_15to64, counts_age_15to64,
                        counts_sex_age_group10, counts_age_group10,
                        counts_sex_age_group, counts_age_group) %>%
  filter(!is.na(age_group))

 counts_raw <- counts_raw %>%
  mutate(
    mdri = mdri[survey_id],
    mdri_se = se.mdri[survey_id]
  )
    
 incid <- incid %>%
  left_join(counts_raw, by = join_by(survey_id, sex, age_group)) %>%
  mutate(
    n_pys_approx = mdri * n_negative,
    x_recent_approx = incid * n_pys_approx
  )


 library(ggplot2)

 incid %>%
  filter(age_group == "Y015_049") %>%
  mutate(
    pys_eff = incid / se.incid^2,
    pys_deff_ratio = n_pys_approx / pys_eff,
    pys_ratio_mean = mean(pys_deff_ratio, na.rm = TRUE),
    .by = sex
  ) %>%
  ggplot(aes(survey_id, pys_deff_ratio)) +
  geom_point() +
  geom_hline(yintercept = 1.0, linetype = "dashed") +
  geom_hline(aes(yintercept = pys_ratio_mean), color = "blue") +
  facet_wrap(~sex) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Ratio of approx PYs to standard-error derived PYs: age 15-49")  


 write_csv(incid, "phia-incid-extract_2024-05-01.csv", na = "")
	library(dplyr)
	library(readr)
	library(tidyr)
	library(forcats)
	library(haven)
	library(survey)

	## devtools::install_github("jeffeaton/svyincid")
	library(svyincid)

	phia_path <- "~/Data/household surveys/PHIA/datasets"

	cmr2017phia <- rdhs::read_zipdata(file.path(phia_path, "CMR/datasets/302 CAMPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip"))
	civ2018phia <- rdhs::read_zipdata(file.path(phia_path, "CIV/datasets/302 CIPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip"))
	eth2018phia <- rdhs::read_zipdata(file.path(phia_path, "ETH/datasets/302 EPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip"))
	ken2018phia <- rdhs::read_zipdata(file.path(phia_path, "KEN/datasets/KENPHIA 2018 Household Interview and Biomarker Datasets v1.1 (DTA).zip"), "kenphia2018adultbio.dta")
	lso2017phia <- rdhs::read_zipdata(file.path(phia_path, "LSO/datasets/302_LePHIA 2016-2017 Adult Biomarker Dataset (DTA).zip"))
	mwi2016phia <- rdhs::read_zipdata(file.path(phia_path, "MWI/datasets/MPHIA 2015-2016 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "mphia2015adultbio.dta")
	nam2017phia <- rdhs::read_zipdata(file.path(phia_path, "NAM/datasets/302_NAMPHIA 2017 Adult Biomarker Dataset (DTA).zip"))
	rwa2019phia <- rdhs::read_zipdata(file.path(phia_path, "RWA/datasets/302 RPHIA 2018-2019 Adult Biomarker Dataset (DTA).zip"))
	swz2017phia <- rdhs::read_zipdata(file.path(phia_path, "SWZ/datasets/SHIMS2 2016-2017 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "shims22016adultbio.dta")
	tza2017phia <- rdhs::read_zipdata(file.path(phia_path, "TZA/datasets/THIS 2016-2017 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "this2016adultbio.dta")
	uga2017phia <- rdhs::read_zipdata(file.path(phia_path, "UGA/datasets/302 UPHIA 2016-2017 Adult Biomarker Dataset (DTA).zip"))
	zmb2016phia <- rdhs::read_zipdata(file.path(phia_path, "ZMB/datasets/ZAMPHIA 2016 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "zamphia2016adultbio.dta")
	zwe2016phia <- rdhs::read_zipdata(file.path(phia_path, "ZWE/datasets/302_ZIMPHIA 2015-2016 Adult Biomarker Dataset (DTA).zip"))
	nga2021phia <- haven::read_dta("~/Data/household surveys/Nigeria/NAIIS/public/datasets/NAIIS2018adultbio.dta")

	#' ## PHIA2
	mwi2020phia <- haven::read_dta(unz(file.path(phia_path, "MWI2020PHIA/datasets/MPHIA 2020-2021 Household Interview and Biomarker Data (DTA).zip"), "mphia2020adultbio.dta"))
	lso2020phia <- haven::read_dta(unz(file.path(phia_path, "LSO2020PHIA/datasets/LePHIA 2020 Household (DTA).zip"), "lephia2020adultbio.dta"))
	moz2021phia <- haven::read_dta(unz(file.path(phia_path, "MOZ2021PHIA/datasets/INSIDA 2021 Household Interview and Biomarker Datasets (DTA).zip"), "insida2021adultbio.dta"))
	swz2021phia <- haven::read_dta(unz(file.path(phia_path, "SWZ2021PHIA/datasets/SHIMS3 2021 Household Interview and Biomarker Datasets (DTA).zip"), "shims32021adultbio.dta"))
	zwe2020phia <- haven::read_dta(unz(file.path(phia_path, "ZWE2020PHIA/datasets/ZIMPHIA 2020 Household (DTA).zip"), "zimphia2020adultbio.dta"))
	zmb2021phia <- haven::read_dta(unz(file.path(phia_path, "ZMB2021PHIA/ZAMPHIA 2021 PU Package.zip"), "zamphia2021adultbio.dta"))
	bwa2021phia <- haven::read_dta("~/Data/household surveys/Botswana/BAIS V/datasets/baisv2021adultbio.dta.zip")

	nga2021phia$centroidid <- sprintf("NG%04d%05d", nga2021phia$varstrat, nga2021phia$varunit)

	datl <- grep("[a-z]{3}20[0-9]{2}phia$", ls(), value = TRUE) %>%
	{setNames(lapply(., get), toupper(.))}

	dat <- datl %>%
	lapply(select, any_of(c("country", "householdid", "personid", "centroidid", "gender", "age", "hivstatusfinal", "recentlagvl", "recentlagvlarv", "btwt0"))) %>%
	lapply(haven::zap_labels) %>%
	Map(mutate, ., survey_id = names(.)) %>%
	bind_rows()

	phia_info <- tribble(
	~survey_id, ~iso3, ~country, ~survey_year_label, ~survey_year, ~phia_phase,
	"CMR2017PHIA", "CMR", "Cameroon", "2017", 2017, "PHIA1",
	"CIV2018PHIA", "CIV", "Cote d'Ivoire", "2017-2018", 2018, "PHIA1",
	"SWZ2017PHIA", "SWZ", "Eswatini", "2016-2017", 2017, "PHIA1",
	"ETH2018PHIA", "ETH", "Ethiopia", "2017-2018", 2018, "PHIA1",
	"KEN2018PHIA", "KEN", "Kenya", "2018", 2018, "PHIA1",
	"LSO2017PHIA", "LSO", "Lesotho", "2016-2017", 2017, "PHIA1",
	"MWI2016PHIA", "MWI", "Malawi", "2015-2016", 2016, "PHIA1",
	"NAM2017PHIA", "NAM", "Namibia", "2017", 2017, "PHIA1",
	"RWA2019PHIA", "RWA", "Rwanda", "2018-2019", 2019, "PHIA1",
	"TZA2017PHIA", "TZA", "Tanzania", "2016-2017", 2017, "PHIA1",
	"UGA2017PHIA", "UGA", "Uganda", "2016-2017", 2017, "PHIA1",
	"ZMB2016PHIA", "ZMB", "Zambia", "2016", 2016, "PHIA1",
	"ZWE2016PHIA", "ZWE", "Zimbabwe", "2015-2016", 2016, "PHIA1",
	"NGA2021PHIA", "NGA", "Nigeria", "2018", 2018, "PHIA1",
	##
	"LSO2020PHIA", "LSO", "Lesotho", "2020", 2020, "PHIA2",
	"MOZ2021PHIA", "MOZ", "Mozambique", "2021", 2021, "PHIA2",
	"MWI2020PHIA", "MWI", "Malawi", "2020", 2020, "PHIA2",
	"SWZ2021PHIA", "SWZ", "Eswatini", "2021", 2021, "PHIA2",
	"ZWE2020PHIA", "ZWE", "Zimbabwe", "2020", 2020, "PHIA2",
	"ZMB2021PHIA", "ZMB", "Zambia", "2021", 2021, "PHIA2",
	"BWA2021PHIA", "BWA", "Botswana", "2021", 2021, "PHIA2"
	)

	dat <- dat %>%
	select(-country) %>%
	left_join(phia_info)

	dat %>%
	count(survey_id, hivstatusfinal) %>%
	spread(hivstatusfinal, n) %>%
	print(n = Inf)

	dat <- dat %>%
	mutate(
	hivstatus
	dat %>%
	count(survey_id, recentlagvlarv) %>%
	spread(recentlagvlarv, n) %>%
	print(n = Inf)


	#' Recode the recent infection status as quadrinomial outcome:
	#' "0. negative": HIV negative
	#' "1. recent": HIV positive and recent HIV infection
	#' "2. long-term": HIV positive and long-term infection (not recent)
	#' "9. not tested": HIV positive and no recent infection available (assumed missing-at-random)
	#'
	#' It must be a factor variable with four levels in this order.
	#' The value labels are not important, but the order is important.
	#'

	dat <- dat %>%
	mutate(
	recent_prev = case_when(hivstatusfinal == 2 ~ "0. negative",
	hivstatusfinal == 1 & recentlagvlarv == 1 ~ "1. recent",
	hivstatusfinal == 1 & recentlagvlarv == 2 ~ "2. long-term",
	hivstatusfinal == 1 & is.na(recentlagvlarv) \|
	!recentlagvlarv %in% 1:2 ~ "9. not tested"),
	recent_prev = factor(recent_prev,
	levels = c("0. negative", "1. recent", "2. long-term", "9. not tested"))
	)

	dat %>%
	filter(btwt0 > 0) %>%
	count(survey_id, recent_prev) %>%
	spread(recent_prev, n) %>%
	print(n = Inf)

	dat <- dat %>%
	filter(!is.na(btwt0) & btwt0 > 0)

	dat <- dat %>%
	mutate(
	sex = recode(gender, `1` = "male", `2` = "female"),
	age_group = naomi::cut_naomi_age_group(age),
	age_group10 = age_group %>%
	fct_collapse("Y015_024" = c("Y015_019", "Y020_024"),
	"Y025_034" = c("Y025_029", "Y030_034"),
	"Y035_044" = c("Y035_039", "Y040_044"),
	"Y045_054" = c("Y045_049", "Y050_054"),
	"Y055_064" = c("Y055_059", "Y060_064"),
	other_level = NA),
	age_15to49 = if_else(age %in% 15:49, "Y015_049", NA_character_),
	age_15to64 = if_else(age %in% 15:64, "Y015_064", NA_character_)
	)


	#' Split datasets by survey because some surveys use different MDRI and FRR

	datspl <- dat %>%
	split(.$survey_id)

	#' Create standard survey::survey.design object
	desspl <- datspl %>%
	lapply(\(x) svydesign(ids = ~centroidid, weights = ~btwt0, nest = TRUE, data = x))


	#' Use the function svyprevincid() to calculate the HIV incidence and prevalence.
	#'
	#' This function uses the delta method to also account for uncertainty about the
	#' mean duration of recent infection (MDRI) and false recent ratio (FRR).
	#'
	#' Some notes on the function:
	#'
	#' * The function returns a `svystat` object, the standard return from the survey package.
	#' This allows standard specification for reporting out CIs, design effects, etc.
	#'
	#' * You can use it with `svyby()` to calculate incidence and prevalence stratified by
	#' categorical variables, and get it to return 95% CIs, design effect, etc.
	#'
	#' * It takes as arguments the MDRI [in years] and FRR and the standard errors for
	#' each of these.
	#' - For PHIA surveys, the MDRI was assumed to be 135 days (= 0.370 years)
	#' with 95% CI 118-142 days (SE = 6.12 days = 0.017 years)
	#' id__EXCEPT for Uganda, which used 153 days (95% CI 127-178 days) = 0.42 years (SE = 0.036).
	#' - For PHIA surveys, the FRR was assumed to be 0.0 with SE = 0.0
	#'
	#' * It does not allow NA values in the recent infection variable (it does not have an
	#' `na.rm = TRUE` argument), so ensure to subset to remove NA calculation.
	#'

	#' To calculate incidence and prevalence among adults 15-49 years (reproduces
	#' MPHIA 2015-16 summary sheet)
	svyprevincid(~recent_prev,
	design = subset(desspl$MWI2016PHIA, age %in% 15:49),
	frr = 0.0, se.frr = 0.0, deff = TRUE)

	#' Incidence by sex using svyby():
	svyby(~recent_prev, ~sex, subset(desspl$MWI2016PHIA, age %in% 15:49), svyprevincid,
	frr = 0.0, se.frr = 0.0, deff = TRUE)


	#' # Calculate incidence by sex and age categories

	mdri <- setNames(rep(130 / 365, length(desspl)), names(desspl))
	se.mdri <- mdri
	se.mdri[] <- (142 - 118)/365/(2 * qnorm(0.975))

	#' Uganda PHIA MDRI: 153 days (95% CI 127-178 days) = 0.42 years (SE = 0.036)
	mdri["UGA2017PHIA"] <- 153 / 365
	se.mdri["UGA2017PHIA"] <- (178 - 127)/365/(2 * qnorm(0.975))

	incid_sex_age_group <- Map(\(des, mdri, se.mdri)
	svyby(~recent_prev, ~survey_id + sex + age_group,
	des, svyprevincid,
	mdri = mdri, se.mdri = se.mdri,
	frr = 0.0, se.frr = 0.0, deff = TRUE),
	desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
	bind_rows()

	incid_age_group <- Map(\(des, mdri, se.mdri)
	svyby(~recent_prev, ~survey_id + age_group,
	des, svyprevincid,
	mdri = mdri, se.mdri = se.mdri,
	frr = 0.0, se.frr = 0.0, deff = TRUE),
	desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
	bind_rows() %>%
	mutate(sex = "both")

	incid_sex_age_group10 <- Map(\(des, mdri, se.mdri)
	svyby(~recent_prev, ~survey_id + sex + age_group10,
	des, svyprevincid,
	mdri = mdri, se.mdri = se.mdri,
	frr = 0.0, se.frr = 0.0, deff = TRUE),
	desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
	bind_rows() %>%
	rename(age_group = age_group10)

	incid_age_group10 <- Map(\(des, mdri, se.mdri)
	svyby(~recent_prev, ~survey_id + age_group10,
	des, svyprevincid,
	mdri = mdri, se.mdri = se.mdri,
	frr = 0.0, se.frr = 0.0, deff = TRUE),
	desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
	bind_rows() %>%
	mutate(sex = "both") %>%
	rename(age_group = age_group10)

	incid_sex_age_15to49 <- Map(\(des, mdri, se.mdri)
	svyby(~recent_prev, ~survey_id + sex + age_15to49,
	des, svyprevincid,
	mdri = mdri, se.mdri = se.mdri,
	frr = 0.0, se.frr = 0.0, deff = TRUE),
	desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
	bind_rows() %>%
	rename(age_group = age_15to49)

	incid_age_15to49 <- Map(\(des, mdri, se.mdri)
	svyby(~recent_prev, ~survey_id + age_15to49,
	des, svyprevincid,
	mdri = mdri, se.mdri = se.mdri,
	frr = 0.0, se.frr = 0.0, deff = TRUE),
	desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
	bind_rows() %>%
	mutate(sex = "both") %>%
	rename(age_group = age_15to49)


	incid_sex_age_15to64 <- Map(\(des, mdri, se.mdri)
	svyby(~recent_prev, ~survey_id + sex + age_15to64,
	des, svyprevincid,
	mdri = mdri, se.mdri = se.mdri,
	frr = 0.0, se.frr = 0.0, deff = TRUE),
	desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
	bind_rows() %>%
	rename(age_group = age_15to64)

	incid_age_15to64 <- Map(\(des, mdri, se.mdri)
	svyby(~recent_prev, ~survey_id + age_15to64,
	des, svyprevincid,
	mdri = mdri, se.mdri = se.mdri,
	frr = 0.0, se.frr = 0.0, deff = TRUE),
	desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
	bind_rows() %>%
	mutate(sex = "both") %>%
	rename(age_group = age_15to64)



	incid <- bind_rows(incid_sex_age_15to49, incid_age_15to49,
	incid_sex_age_15to64, incid_age_15to64,
	incid_sex_age_group10, incid_age_group10,
	incid_sex_age_group, incid_age_group)

	#' Add 95% CI on log scale (avoid ci_lower < 0))
	incid <- incid %>%
	mutate(
	se.log_incid = se.incid / incid,
	ci_l.incid = exp(log(incid) - qnorm(0.975) * se.log_incid),
	ci_u.incid = exp(log(incid) + qnorm(0.975) * se.log_incid),
	se.log_incid = NULL
	)

	incid <- right_join(phia_info, incid) %>%
	as_tibble()

	#' Add raw counts
	#'
	#' Assuming FRR = 0 and all HIV positive are tested for recent infection,
	#' the incidence calculation becomes: [incidence] = [# recent] / (MDRI * [# negative]).
	#'
	#' Assuming no uncertaity in MDRI, the Poisson approximation becomes:
	#' [# recent] ~ Poisson( [incidence] * MDRI * [# negative])
	#'
	#' -> Use:
	#' - n_pys_approx = MDRI * [# negative] as an approximation to PYs
	#' - x_recent_approx = incidence * n_pys_approx
	#'
	#' The approximation ignores uncertainty from:
	#' 1. Design effects (clustering and unequal sample weights)
	#' 2. Proportion of HIV positive not tested for recent infection
	#' 3. Uncertainty about MDRI
	#'
	#' Number 1 could be partially accounted for with Kish effective sample size (as
	#' with other Naomi inputs) and number 2 through deflating the number negative
	#' denominator by the proportion of positive respondents tested for HIV.
	#' In practice proportion missing recent is pretty small in PHIA surveys, so
	#' unlikely to be a major difference.
	#'

	dat <- dat %>%
	mutate(
	recent_prev_label = recent_prev %>%
	recode("0. negative" = "n_negative",
	"1. recent" = "n_pos_recent",
	"2. long-term" = "n_pos_longterm",
	"9. not tested" = "n_pos_nottest")
	)


	counts_sex_age_group <- dat %>%
	count(survey_id, sex, age_group, recent_prev_label) %>%
	pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

	counts_age_group <- dat %>%
	count(survey_id, sex = "both", age_group, recent_prev_label) %>%
	pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

	counts_sex_age_group10 <- dat %>%
	count(survey_id, sex, age_group = age_group10, recent_prev_label) %>%
	pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

	counts_age_group10 <- dat %>%
	count(survey_id, sex = "both", age_group = age_group10, recent_prev_label) %>%
	pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

	counts_sex_age_15to49 <- dat %>%
	count(survey_id, sex, age_group = age_15to49, recent_prev_label) %>%
	pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

	counts_age_15to49 <- dat %>%
	count(survey_id, sex = "both", age_group = age_15to49, recent_prev_label) %>%
	pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

	counts_sex_age_15to64 <- dat %>%
	count(survey_id, sex, age_group = age_15to64, recent_prev_label) %>%
	pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

	counts_age_15to64 <- dat %>%
	count(survey_id, sex = "both", age_group = age_15to64, recent_prev_label) %>%
	pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)

	counts_raw <- bind_rows(counts_sex_age_15to49, counts_age_15to49,
	counts_sex_age_15to64, counts_age_15to64,
	counts_sex_age_group10, counts_age_group10,
	counts_sex_age_group, counts_age_group) %>%
	filter(!is.na(age_group))

	counts_raw <- counts_raw %>%
	mutate(
	mdri = mdri[survey_id],
	mdri_se = se.mdri[survey_id]
	)

	incid <- incid %>%
	left_join(counts_raw, by = join_by(survey_id, sex, age_group)) %>%
	mutate(
	n_pys_approx = mdri * n_negative,
	x_recent_approx = incid * n_pys_approx
	)


	library(ggplot2)

	incid %>%
	filter(age_group == "Y015_049") %>%
	mutate(
	pys_eff = incid / se.incid^2,
	pys_deff_ratio = n_pys_approx / pys_eff,
	pys_ratio_mean = mean(pys_deff_ratio, na.rm = TRUE),
	.by = sex
	) %>%
	ggplot(aes(survey_id, pys_deff_ratio)) +
	geom_point() +
	geom_hline(yintercept = 1.0, linetype = "dashed") +
	geom_hline(aes(yintercept = pys_ratio_mean), color = "blue") +
	facet_wrap(~sex) +
	theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
	labs(title = "Ratio of approx PYs to standard-error derived PYs: age 15-49")


	write_csv(incid, "phia-incid-extract_2024-05-01.csv", na = "")