Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jeffeaton/ed34932ee787f721cc7cc67a5a3bc02f to your computer and use it in GitHub Desktop.
Save jeffeaton/ed34932ee787f721cc7cc67a5a3bc02f to your computer and use it in GitHub Desktop.
PHIA HIV incidence survey extract for UNAIDS Reference Group incidence analysis
library(dplyr)
library(readr)
library(tidyr)
library(forcats)
library(haven)
library(survey)
## devtools::install_github("jeffeaton/svyincid")
library(svyincid)
phia_path <- "~/Data/household surveys/PHIA/datasets"
cmr2017phia <- rdhs::read_zipdata(file.path(phia_path, "CMR/datasets/302 CAMPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip"))
civ2018phia <- rdhs::read_zipdata(file.path(phia_path, "CIV/datasets/302 CIPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip"))
eth2018phia <- rdhs::read_zipdata(file.path(phia_path, "ETH/datasets/302 EPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip"))
ken2018phia <- rdhs::read_zipdata(file.path(phia_path, "KEN/datasets/KENPHIA 2018 Household Interview and Biomarker Datasets v1.1 (DTA).zip"), "kenphia2018adultbio.dta")
lso2017phia <- rdhs::read_zipdata(file.path(phia_path, "LSO/datasets/302_LePHIA 2016-2017 Adult Biomarker Dataset (DTA).zip"))
mwi2016phia <- rdhs::read_zipdata(file.path(phia_path, "MWI/datasets/MPHIA 2015-2016 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "mphia2015adultbio.dta")
nam2017phia <- rdhs::read_zipdata(file.path(phia_path, "NAM/datasets/302_NAMPHIA 2017 Adult Biomarker Dataset (DTA).zip"))
rwa2019phia <- rdhs::read_zipdata(file.path(phia_path, "RWA/datasets/302 RPHIA 2018-2019 Adult Biomarker Dataset (DTA).zip"))
swz2017phia <- rdhs::read_zipdata(file.path(phia_path, "SWZ/datasets/SHIMS2 2016-2017 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "shims22016adultbio.dta")
tza2017phia <- rdhs::read_zipdata(file.path(phia_path, "TZA/datasets/THIS 2016-2017 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "this2016adultbio.dta")
uga2017phia <- rdhs::read_zipdata(file.path(phia_path, "UGA/datasets/302 UPHIA 2016-2017 Adult Biomarker Dataset (DTA).zip"))
zmb2016phia <- rdhs::read_zipdata(file.path(phia_path, "ZMB/datasets/ZAMPHIA 2016 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "zamphia2016adultbio.dta")
zwe2016phia <- rdhs::read_zipdata(file.path(phia_path, "ZWE/datasets/302_ZIMPHIA 2015-2016 Adult Biomarker Dataset (DTA).zip"))
nga2021phia <- haven::read_dta("~/Data/household surveys/Nigeria/NAIIS/public/datasets/NAIIS2018adultbio.dta")
#' ## PHIA2
mwi2020phia <- haven::read_dta(unz(file.path(phia_path, "MWI2020PHIA/datasets/MPHIA 2020-2021 Household Interview and Biomarker Data (DTA).zip"), "mphia2020adultbio.dta"))
lso2020phia <- haven::read_dta(unz(file.path(phia_path, "LSO2020PHIA/datasets/LePHIA 2020 Household (DTA).zip"), "lephia2020adultbio.dta"))
moz2021phia <- haven::read_dta(unz(file.path(phia_path, "MOZ2021PHIA/datasets/INSIDA 2021 Household Interview and Biomarker Datasets (DTA).zip"), "insida2021adultbio.dta"))
swz2021phia <- haven::read_dta(unz(file.path(phia_path, "SWZ2021PHIA/datasets/SHIMS3 2021 Household Interview and Biomarker Datasets (DTA).zip"), "shims32021adultbio.dta"))
zwe2020phia <- haven::read_dta(unz(file.path(phia_path, "ZWE2020PHIA/datasets/ZIMPHIA 2020 Household (DTA).zip"), "zimphia2020adultbio.dta"))
zmb2021phia <- haven::read_dta(unz(file.path(phia_path, "ZMB2021PHIA/ZAMPHIA 2021 PU Package.zip"), "zamphia2021adultbio.dta"))
bwa2021phia <- haven::read_dta("~/Data/household surveys/Botswana/BAIS V/datasets/baisv2021adultbio.dta.zip")
nga2021phia$centroidid <- sprintf("NG%04d%05d", nga2021phia$varstrat, nga2021phia$varunit)
datl <- grep("[a-z]{3}20[0-9]{2}phia$", ls(), value = TRUE) %>%
{setNames(lapply(., get), toupper(.))}
dat <- datl %>%
lapply(select, any_of(c("country", "householdid", "personid", "centroidid", "gender", "age", "hivstatusfinal", "recentlagvl", "recentlagvlarv", "btwt0"))) %>%
lapply(haven::zap_labels) %>%
Map(mutate, ., survey_id = names(.)) %>%
bind_rows()
phia_info <- tribble(
~survey_id, ~iso3, ~country, ~survey_year_label, ~survey_year, ~phia_phase,
"CMR2017PHIA", "CMR", "Cameroon", "2017", 2017, "PHIA1",
"CIV2018PHIA", "CIV", "Cote d'Ivoire", "2017-2018", 2018, "PHIA1",
"SWZ2017PHIA", "SWZ", "Eswatini", "2016-2017", 2017, "PHIA1",
"ETH2018PHIA", "ETH", "Ethiopia", "2017-2018", 2018, "PHIA1",
"KEN2018PHIA", "KEN", "Kenya", "2018", 2018, "PHIA1",
"LSO2017PHIA", "LSO", "Lesotho", "2016-2017", 2017, "PHIA1",
"MWI2016PHIA", "MWI", "Malawi", "2015-2016", 2016, "PHIA1",
"NAM2017PHIA", "NAM", "Namibia", "2017", 2017, "PHIA1",
"RWA2019PHIA", "RWA", "Rwanda", "2018-2019", 2019, "PHIA1",
"TZA2017PHIA", "TZA", "Tanzania", "2016-2017", 2017, "PHIA1",
"UGA2017PHIA", "UGA", "Uganda", "2016-2017", 2017, "PHIA1",
"ZMB2016PHIA", "ZMB", "Zambia", "2016", 2016, "PHIA1",
"ZWE2016PHIA", "ZWE", "Zimbabwe", "2015-2016", 2016, "PHIA1",
"NGA2021PHIA", "NGA", "Nigeria", "2018", 2018, "PHIA1",
##
"LSO2020PHIA", "LSO", "Lesotho", "2020", 2020, "PHIA2",
"MOZ2021PHIA", "MOZ", "Mozambique", "2021", 2021, "PHIA2",
"MWI2020PHIA", "MWI", "Malawi", "2020", 2020, "PHIA2",
"SWZ2021PHIA", "SWZ", "Eswatini", "2021", 2021, "PHIA2",
"ZWE2020PHIA", "ZWE", "Zimbabwe", "2020", 2020, "PHIA2",
"ZMB2021PHIA", "ZMB", "Zambia", "2021", 2021, "PHIA2",
"BWA2021PHIA", "BWA", "Botswana", "2021", 2021, "PHIA2"
)
dat <- dat %>%
select(-country) %>%
left_join(phia_info)
dat %>%
count(survey_id, hivstatusfinal) %>%
spread(hivstatusfinal, n) %>%
print(n = Inf)
dat <- dat %>%
mutate(
hivstatus
dat %>%
count(survey_id, recentlagvlarv) %>%
spread(recentlagvlarv, n) %>%
print(n = Inf)
#' Recode the recent infection status as quadrinomial outcome:
#' "0. negative": HIV negative
#' "1. recent": HIV positive and recent HIV infection
#' "2. long-term": HIV positive and long-term infection (not recent)
#' "9. not tested": HIV positive and no recent infection available (assumed missing-at-random)
#'
#' It must be a **factor variable with four levels** in this order.
#' The value labels are not important, but the order is important.
#'
dat <- dat %>%
mutate(
recent_prev = case_when(hivstatusfinal == 2 ~ "0. negative",
hivstatusfinal == 1 & recentlagvlarv == 1 ~ "1. recent",
hivstatusfinal == 1 & recentlagvlarv == 2 ~ "2. long-term",
hivstatusfinal == 1 & is.na(recentlagvlarv) |
!recentlagvlarv %in% 1:2 ~ "9. not tested"),
recent_prev = factor(recent_prev,
levels = c("0. negative", "1. recent", "2. long-term", "9. not tested"))
)
dat %>%
filter(btwt0 > 0) %>%
count(survey_id, recent_prev) %>%
spread(recent_prev, n) %>%
print(n = Inf)
dat <- dat %>%
filter(!is.na(btwt0) & btwt0 > 0)
dat <- dat %>%
mutate(
sex = recode(gender, `1` = "male", `2` = "female"),
age_group = naomi::cut_naomi_age_group(age),
age_group10 = age_group %>%
fct_collapse("Y015_024" = c("Y015_019", "Y020_024"),
"Y025_034" = c("Y025_029", "Y030_034"),
"Y035_044" = c("Y035_039", "Y040_044"),
"Y045_054" = c("Y045_049", "Y050_054"),
"Y055_064" = c("Y055_059", "Y060_064"),
other_level = NA),
age_15to49 = if_else(age %in% 15:49, "Y015_049", NA_character_),
age_15to64 = if_else(age %in% 15:64, "Y015_064", NA_character_)
)
#' Split datasets by survey because some surveys use different MDRI and FRR
datspl <- dat %>%
split(.$survey_id)
#' Create standard survey::survey.design object
desspl <- datspl %>%
lapply(\(x) svydesign(ids = ~centroidid, weights = ~btwt0, nest = TRUE, data = x))
#' Use the function svyprevincid() to calculate the HIV incidence and prevalence.
#'
#' This function uses the delta method to also account for uncertainty about the
#' mean duration of recent infection (MDRI) and false recent ratio (FRR).
#'
#' Some notes on the function:
#'
#' * The function returns a `svystat` object, the standard return from the survey package.
#' This allows standard specification for reporting out CIs, design effects, etc.
#'
#' * You can use it with `svyby()` to calculate incidence and prevalence stratified by
#' categorical variables, and get it to return 95% CIs, design effect, etc.
#'
#' * It takes as arguments the MDRI [in years] and FRR and the standard errors for
#' each of these.
#' - For PHIA surveys, the MDRI was assumed to be 135 days (= 0.370 years)
#' with 95% CI 118-142 days (SE = 6.12 days = 0.017 years)
#' id__EXCEPT for Uganda, which used 153 days (95% CI 127-178 days) = 0.42 years (SE = 0.036).
#' - For PHIA surveys, the FRR was assumed to be 0.0 with SE = 0.0
#'
#' * It does not allow NA values in the recent infection variable (it does not have an
#' `na.rm = TRUE` argument), so ensure to subset to remove NA calculation.
#'
#' To calculate incidence and prevalence among adults 15-49 years (reproduces
#' MPHIA 2015-16 summary sheet)
svyprevincid(~recent_prev,
design = subset(desspl$MWI2016PHIA, age %in% 15:49),
frr = 0.0, se.frr = 0.0, deff = TRUE)
#' Incidence by sex using svyby():
svyby(~recent_prev, ~sex, subset(desspl$MWI2016PHIA, age %in% 15:49), svyprevincid,
frr = 0.0, se.frr = 0.0, deff = TRUE)
#' # Calculate incidence by sex and age categories
mdri <- setNames(rep(130 / 365, length(desspl)), names(desspl))
se.mdri <- mdri
se.mdri[] <- (142 - 118)/365/(2 * qnorm(0.975))
#' Uganda PHIA MDRI: 153 days (95% CI 127-178 days) = 0.42 years (SE = 0.036)
mdri["UGA2017PHIA"] <- 153 / 365
se.mdri["UGA2017PHIA"] <- (178 - 127)/365/(2 * qnorm(0.975))
incid_sex_age_group <- Map(\(des, mdri, se.mdri)
svyby(~recent_prev, ~survey_id + sex + age_group,
des, svyprevincid,
mdri = mdri, se.mdri = se.mdri,
frr = 0.0, se.frr = 0.0, deff = TRUE),
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
bind_rows()
incid_age_group <- Map(\(des, mdri, se.mdri)
svyby(~recent_prev, ~survey_id + age_group,
des, svyprevincid,
mdri = mdri, se.mdri = se.mdri,
frr = 0.0, se.frr = 0.0, deff = TRUE),
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
bind_rows() %>%
mutate(sex = "both")
incid_sex_age_group10 <- Map(\(des, mdri, se.mdri)
svyby(~recent_prev, ~survey_id + sex + age_group10,
des, svyprevincid,
mdri = mdri, se.mdri = se.mdri,
frr = 0.0, se.frr = 0.0, deff = TRUE),
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
bind_rows() %>%
rename(age_group = age_group10)
incid_age_group10 <- Map(\(des, mdri, se.mdri)
svyby(~recent_prev, ~survey_id + age_group10,
des, svyprevincid,
mdri = mdri, se.mdri = se.mdri,
frr = 0.0, se.frr = 0.0, deff = TRUE),
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
bind_rows() %>%
mutate(sex = "both") %>%
rename(age_group = age_group10)
incid_sex_age_15to49 <- Map(\(des, mdri, se.mdri)
svyby(~recent_prev, ~survey_id + sex + age_15to49,
des, svyprevincid,
mdri = mdri, se.mdri = se.mdri,
frr = 0.0, se.frr = 0.0, deff = TRUE),
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
bind_rows() %>%
rename(age_group = age_15to49)
incid_age_15to49 <- Map(\(des, mdri, se.mdri)
svyby(~recent_prev, ~survey_id + age_15to49,
des, svyprevincid,
mdri = mdri, se.mdri = se.mdri,
frr = 0.0, se.frr = 0.0, deff = TRUE),
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
bind_rows() %>%
mutate(sex = "both") %>%
rename(age_group = age_15to49)
incid_sex_age_15to64 <- Map(\(des, mdri, se.mdri)
svyby(~recent_prev, ~survey_id + sex + age_15to64,
des, svyprevincid,
mdri = mdri, se.mdri = se.mdri,
frr = 0.0, se.frr = 0.0, deff = TRUE),
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
bind_rows() %>%
rename(age_group = age_15to64)
incid_age_15to64 <- Map(\(des, mdri, se.mdri)
svyby(~recent_prev, ~survey_id + age_15to64,
des, svyprevincid,
mdri = mdri, se.mdri = se.mdri,
frr = 0.0, se.frr = 0.0, deff = TRUE),
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>%
bind_rows() %>%
mutate(sex = "both") %>%
rename(age_group = age_15to64)
incid <- bind_rows(incid_sex_age_15to49, incid_age_15to49,
incid_sex_age_15to64, incid_age_15to64,
incid_sex_age_group10, incid_age_group10,
incid_sex_age_group, incid_age_group)
#' Add 95% CI on log scale (avoid ci_lower < 0))
incid <- incid %>%
mutate(
se.log_incid = se.incid / incid,
ci_l.incid = exp(log(incid) - qnorm(0.975) * se.log_incid),
ci_u.incid = exp(log(incid) + qnorm(0.975) * se.log_incid),
se.log_incid = NULL
)
incid <- right_join(phia_info, incid) %>%
as_tibble()
#' Add raw counts
#'
#' Assuming FRR = 0 and all HIV positive are tested for recent infection,
#' the incidence calculation becomes: [incidence] = [# recent] / (MDRI * [# negative]).
#'
#' Assuming no uncertaity in MDRI, the Poisson approximation becomes:
#' [# recent] ~ Poisson( [incidence] * MDRI * [# negative])
#'
#' -> Use:
#' - n_pys_approx = MDRI * [# negative] as an approximation to PYs
#' - x_recent_approx = incidence * n_pys_approx
#'
#' The approximation ignores uncertainty from:
#' 1. Design effects (clustering and unequal sample weights)
#' 2. Proportion of HIV positive not tested for recent infection
#' 3. Uncertainty about MDRI
#'
#' Number 1 could be partially accounted for with Kish effective sample size (as
#' with other Naomi inputs) and number 2 through deflating the number negative
#' denominator by the proportion of positive respondents tested for HIV.
#' In practice proportion missing recent is pretty small in PHIA surveys, so
#' unlikely to be a major difference.
#'
dat <- dat %>%
mutate(
recent_prev_label = recent_prev %>%
recode("0. negative" = "n_negative",
"1. recent" = "n_pos_recent",
"2. long-term" = "n_pos_longterm",
"9. not tested" = "n_pos_nottest")
)
counts_sex_age_group <- dat %>%
count(survey_id, sex, age_group, recent_prev_label) %>%
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)
counts_age_group <- dat %>%
count(survey_id, sex = "both", age_group, recent_prev_label) %>%
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)
counts_sex_age_group10 <- dat %>%
count(survey_id, sex, age_group = age_group10, recent_prev_label) %>%
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)
counts_age_group10 <- dat %>%
count(survey_id, sex = "both", age_group = age_group10, recent_prev_label) %>%
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)
counts_sex_age_15to49 <- dat %>%
count(survey_id, sex, age_group = age_15to49, recent_prev_label) %>%
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)
counts_age_15to49 <- dat %>%
count(survey_id, sex = "both", age_group = age_15to49, recent_prev_label) %>%
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)
counts_sex_age_15to64 <- dat %>%
count(survey_id, sex, age_group = age_15to64, recent_prev_label) %>%
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)
counts_age_15to64 <- dat %>%
count(survey_id, sex = "both", age_group = age_15to64, recent_prev_label) %>%
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0)
counts_raw <- bind_rows(counts_sex_age_15to49, counts_age_15to49,
counts_sex_age_15to64, counts_age_15to64,
counts_sex_age_group10, counts_age_group10,
counts_sex_age_group, counts_age_group) %>%
filter(!is.na(age_group))
counts_raw <- counts_raw %>%
mutate(
mdri = mdri[survey_id],
mdri_se = se.mdri[survey_id]
)
incid <- incid %>%
left_join(counts_raw, by = join_by(survey_id, sex, age_group)) %>%
mutate(
n_pys_approx = mdri * n_negative,
x_recent_approx = incid * n_pys_approx
)
library(ggplot2)
incid %>%
filter(age_group == "Y015_049") %>%
mutate(
pys_eff = incid / se.incid^2,
pys_deff_ratio = n_pys_approx / pys_eff,
pys_ratio_mean = mean(pys_deff_ratio, na.rm = TRUE),
.by = sex
) %>%
ggplot(aes(survey_id, pys_deff_ratio)) +
geom_point() +
geom_hline(yintercept = 1.0, linetype = "dashed") +
geom_hline(aes(yintercept = pys_ratio_mean), color = "blue") +
facet_wrap(~sex) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Ratio of approx PYs to standard-error derived PYs: age 15-49")
write_csv(incid, "phia-incid-extract_2024-05-01.csv", na = "")
@jeffeaton
Copy link
Author

incidence-approx-deff_2024-05-01

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment