Created
May 1, 2024 13:12
-
-
Save jeffeaton/ed34932ee787f721cc7cc67a5a3bc02f to your computer and use it in GitHub Desktop.
PHIA HIV incidence survey extract for UNAIDS Reference Group incidence analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(readr) | |
library(tidyr) | |
library(forcats) | |
library(haven) | |
library(survey) | |
## devtools::install_github("jeffeaton/svyincid") | |
library(svyincid) | |
phia_path <- "~/Data/household surveys/PHIA/datasets" | |
cmr2017phia <- rdhs::read_zipdata(file.path(phia_path, "CMR/datasets/302 CAMPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip")) | |
civ2018phia <- rdhs::read_zipdata(file.path(phia_path, "CIV/datasets/302 CIPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip")) | |
eth2018phia <- rdhs::read_zipdata(file.path(phia_path, "ETH/datasets/302 EPHIA 2017-2018 Adult Biomarker Dataset (DTA).zip")) | |
ken2018phia <- rdhs::read_zipdata(file.path(phia_path, "KEN/datasets/KENPHIA 2018 Household Interview and Biomarker Datasets v1.1 (DTA).zip"), "kenphia2018adultbio.dta") | |
lso2017phia <- rdhs::read_zipdata(file.path(phia_path, "LSO/datasets/302_LePHIA 2016-2017 Adult Biomarker Dataset (DTA).zip")) | |
mwi2016phia <- rdhs::read_zipdata(file.path(phia_path, "MWI/datasets/MPHIA 2015-2016 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "mphia2015adultbio.dta") | |
nam2017phia <- rdhs::read_zipdata(file.path(phia_path, "NAM/datasets/302_NAMPHIA 2017 Adult Biomarker Dataset (DTA).zip")) | |
rwa2019phia <- rdhs::read_zipdata(file.path(phia_path, "RWA/datasets/302 RPHIA 2018-2019 Adult Biomarker Dataset (DTA).zip")) | |
swz2017phia <- rdhs::read_zipdata(file.path(phia_path, "SWZ/datasets/SHIMS2 2016-2017 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "shims22016adultbio.dta") | |
tza2017phia <- rdhs::read_zipdata(file.path(phia_path, "TZA/datasets/THIS 2016-2017 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "this2016adultbio.dta") | |
uga2017phia <- rdhs::read_zipdata(file.path(phia_path, "UGA/datasets/302 UPHIA 2016-2017 Adult Biomarker Dataset (DTA).zip")) | |
zmb2016phia <- rdhs::read_zipdata(file.path(phia_path, "ZMB/datasets/ZAMPHIA 2016 Household Interview and Biomarker Datasets v2.0 (DTA).zip"), "zamphia2016adultbio.dta") | |
zwe2016phia <- rdhs::read_zipdata(file.path(phia_path, "ZWE/datasets/302_ZIMPHIA 2015-2016 Adult Biomarker Dataset (DTA).zip")) | |
nga2021phia <- haven::read_dta("~/Data/household surveys/Nigeria/NAIIS/public/datasets/NAIIS2018adultbio.dta") | |
#' ## PHIA2 | |
mwi2020phia <- haven::read_dta(unz(file.path(phia_path, "MWI2020PHIA/datasets/MPHIA 2020-2021 Household Interview and Biomarker Data (DTA).zip"), "mphia2020adultbio.dta")) | |
lso2020phia <- haven::read_dta(unz(file.path(phia_path, "LSO2020PHIA/datasets/LePHIA 2020 Household (DTA).zip"), "lephia2020adultbio.dta")) | |
moz2021phia <- haven::read_dta(unz(file.path(phia_path, "MOZ2021PHIA/datasets/INSIDA 2021 Household Interview and Biomarker Datasets (DTA).zip"), "insida2021adultbio.dta")) | |
swz2021phia <- haven::read_dta(unz(file.path(phia_path, "SWZ2021PHIA/datasets/SHIMS3 2021 Household Interview and Biomarker Datasets (DTA).zip"), "shims32021adultbio.dta")) | |
zwe2020phia <- haven::read_dta(unz(file.path(phia_path, "ZWE2020PHIA/datasets/ZIMPHIA 2020 Household (DTA).zip"), "zimphia2020adultbio.dta")) | |
zmb2021phia <- haven::read_dta(unz(file.path(phia_path, "ZMB2021PHIA/ZAMPHIA 2021 PU Package.zip"), "zamphia2021adultbio.dta")) | |
bwa2021phia <- haven::read_dta("~/Data/household surveys/Botswana/BAIS V/datasets/baisv2021adultbio.dta.zip") | |
nga2021phia$centroidid <- sprintf("NG%04d%05d", nga2021phia$varstrat, nga2021phia$varunit) | |
datl <- grep("[a-z]{3}20[0-9]{2}phia$", ls(), value = TRUE) %>% | |
{setNames(lapply(., get), toupper(.))} | |
dat <- datl %>% | |
lapply(select, any_of(c("country", "householdid", "personid", "centroidid", "gender", "age", "hivstatusfinal", "recentlagvl", "recentlagvlarv", "btwt0"))) %>% | |
lapply(haven::zap_labels) %>% | |
Map(mutate, ., survey_id = names(.)) %>% | |
bind_rows() | |
phia_info <- tribble( | |
~survey_id, ~iso3, ~country, ~survey_year_label, ~survey_year, ~phia_phase, | |
"CMR2017PHIA", "CMR", "Cameroon", "2017", 2017, "PHIA1", | |
"CIV2018PHIA", "CIV", "Cote d'Ivoire", "2017-2018", 2018, "PHIA1", | |
"SWZ2017PHIA", "SWZ", "Eswatini", "2016-2017", 2017, "PHIA1", | |
"ETH2018PHIA", "ETH", "Ethiopia", "2017-2018", 2018, "PHIA1", | |
"KEN2018PHIA", "KEN", "Kenya", "2018", 2018, "PHIA1", | |
"LSO2017PHIA", "LSO", "Lesotho", "2016-2017", 2017, "PHIA1", | |
"MWI2016PHIA", "MWI", "Malawi", "2015-2016", 2016, "PHIA1", | |
"NAM2017PHIA", "NAM", "Namibia", "2017", 2017, "PHIA1", | |
"RWA2019PHIA", "RWA", "Rwanda", "2018-2019", 2019, "PHIA1", | |
"TZA2017PHIA", "TZA", "Tanzania", "2016-2017", 2017, "PHIA1", | |
"UGA2017PHIA", "UGA", "Uganda", "2016-2017", 2017, "PHIA1", | |
"ZMB2016PHIA", "ZMB", "Zambia", "2016", 2016, "PHIA1", | |
"ZWE2016PHIA", "ZWE", "Zimbabwe", "2015-2016", 2016, "PHIA1", | |
"NGA2021PHIA", "NGA", "Nigeria", "2018", 2018, "PHIA1", | |
## | |
"LSO2020PHIA", "LSO", "Lesotho", "2020", 2020, "PHIA2", | |
"MOZ2021PHIA", "MOZ", "Mozambique", "2021", 2021, "PHIA2", | |
"MWI2020PHIA", "MWI", "Malawi", "2020", 2020, "PHIA2", | |
"SWZ2021PHIA", "SWZ", "Eswatini", "2021", 2021, "PHIA2", | |
"ZWE2020PHIA", "ZWE", "Zimbabwe", "2020", 2020, "PHIA2", | |
"ZMB2021PHIA", "ZMB", "Zambia", "2021", 2021, "PHIA2", | |
"BWA2021PHIA", "BWA", "Botswana", "2021", 2021, "PHIA2" | |
) | |
dat <- dat %>% | |
select(-country) %>% | |
left_join(phia_info) | |
dat %>% | |
count(survey_id, hivstatusfinal) %>% | |
spread(hivstatusfinal, n) %>% | |
print(n = Inf) | |
dat <- dat %>% | |
mutate( | |
hivstatus | |
dat %>% | |
count(survey_id, recentlagvlarv) %>% | |
spread(recentlagvlarv, n) %>% | |
print(n = Inf) | |
#' Recode the recent infection status as quadrinomial outcome: | |
#' "0. negative": HIV negative | |
#' "1. recent": HIV positive and recent HIV infection | |
#' "2. long-term": HIV positive and long-term infection (not recent) | |
#' "9. not tested": HIV positive and no recent infection available (assumed missing-at-random) | |
#' | |
#' It must be a **factor variable with four levels** in this order. | |
#' The value labels are not important, but the order is important. | |
#' | |
dat <- dat %>% | |
mutate( | |
recent_prev = case_when(hivstatusfinal == 2 ~ "0. negative", | |
hivstatusfinal == 1 & recentlagvlarv == 1 ~ "1. recent", | |
hivstatusfinal == 1 & recentlagvlarv == 2 ~ "2. long-term", | |
hivstatusfinal == 1 & is.na(recentlagvlarv) | | |
!recentlagvlarv %in% 1:2 ~ "9. not tested"), | |
recent_prev = factor(recent_prev, | |
levels = c("0. negative", "1. recent", "2. long-term", "9. not tested")) | |
) | |
dat %>% | |
filter(btwt0 > 0) %>% | |
count(survey_id, recent_prev) %>% | |
spread(recent_prev, n) %>% | |
print(n = Inf) | |
dat <- dat %>% | |
filter(!is.na(btwt0) & btwt0 > 0) | |
dat <- dat %>% | |
mutate( | |
sex = recode(gender, `1` = "male", `2` = "female"), | |
age_group = naomi::cut_naomi_age_group(age), | |
age_group10 = age_group %>% | |
fct_collapse("Y015_024" = c("Y015_019", "Y020_024"), | |
"Y025_034" = c("Y025_029", "Y030_034"), | |
"Y035_044" = c("Y035_039", "Y040_044"), | |
"Y045_054" = c("Y045_049", "Y050_054"), | |
"Y055_064" = c("Y055_059", "Y060_064"), | |
other_level = NA), | |
age_15to49 = if_else(age %in% 15:49, "Y015_049", NA_character_), | |
age_15to64 = if_else(age %in% 15:64, "Y015_064", NA_character_) | |
) | |
#' Split datasets by survey because some surveys use different MDRI and FRR | |
datspl <- dat %>% | |
split(.$survey_id) | |
#' Create standard survey::survey.design object | |
desspl <- datspl %>% | |
lapply(\(x) svydesign(ids = ~centroidid, weights = ~btwt0, nest = TRUE, data = x)) | |
#' Use the function svyprevincid() to calculate the HIV incidence and prevalence. | |
#' | |
#' This function uses the delta method to also account for uncertainty about the | |
#' mean duration of recent infection (MDRI) and false recent ratio (FRR). | |
#' | |
#' Some notes on the function: | |
#' | |
#' * The function returns a `svystat` object, the standard return from the survey package. | |
#' This allows standard specification for reporting out CIs, design effects, etc. | |
#' | |
#' * You can use it with `svyby()` to calculate incidence and prevalence stratified by | |
#' categorical variables, and get it to return 95% CIs, design effect, etc. | |
#' | |
#' * It takes as arguments the MDRI [in years] and FRR and the standard errors for | |
#' each of these. | |
#' - For PHIA surveys, the MDRI was assumed to be 135 days (= 0.370 years) | |
#' with 95% CI 118-142 days (SE = 6.12 days = 0.017 years) | |
#' id__EXCEPT for Uganda, which used 153 days (95% CI 127-178 days) = 0.42 years (SE = 0.036). | |
#' - For PHIA surveys, the FRR was assumed to be 0.0 with SE = 0.0 | |
#' | |
#' * It does not allow NA values in the recent infection variable (it does not have an | |
#' `na.rm = TRUE` argument), so ensure to subset to remove NA calculation. | |
#' | |
#' To calculate incidence and prevalence among adults 15-49 years (reproduces | |
#' MPHIA 2015-16 summary sheet) | |
svyprevincid(~recent_prev, | |
design = subset(desspl$MWI2016PHIA, age %in% 15:49), | |
frr = 0.0, se.frr = 0.0, deff = TRUE) | |
#' Incidence by sex using svyby(): | |
svyby(~recent_prev, ~sex, subset(desspl$MWI2016PHIA, age %in% 15:49), svyprevincid, | |
frr = 0.0, se.frr = 0.0, deff = TRUE) | |
#' # Calculate incidence by sex and age categories | |
mdri <- setNames(rep(130 / 365, length(desspl)), names(desspl)) | |
se.mdri <- mdri | |
se.mdri[] <- (142 - 118)/365/(2 * qnorm(0.975)) | |
#' Uganda PHIA MDRI: 153 days (95% CI 127-178 days) = 0.42 years (SE = 0.036) | |
mdri["UGA2017PHIA"] <- 153 / 365 | |
se.mdri["UGA2017PHIA"] <- (178 - 127)/365/(2 * qnorm(0.975)) | |
incid_sex_age_group <- Map(\(des, mdri, se.mdri) | |
svyby(~recent_prev, ~survey_id + sex + age_group, | |
des, svyprevincid, | |
mdri = mdri, se.mdri = se.mdri, | |
frr = 0.0, se.frr = 0.0, deff = TRUE), | |
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>% | |
bind_rows() | |
incid_age_group <- Map(\(des, mdri, se.mdri) | |
svyby(~recent_prev, ~survey_id + age_group, | |
des, svyprevincid, | |
mdri = mdri, se.mdri = se.mdri, | |
frr = 0.0, se.frr = 0.0, deff = TRUE), | |
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>% | |
bind_rows() %>% | |
mutate(sex = "both") | |
incid_sex_age_group10 <- Map(\(des, mdri, se.mdri) | |
svyby(~recent_prev, ~survey_id + sex + age_group10, | |
des, svyprevincid, | |
mdri = mdri, se.mdri = se.mdri, | |
frr = 0.0, se.frr = 0.0, deff = TRUE), | |
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>% | |
bind_rows() %>% | |
rename(age_group = age_group10) | |
incid_age_group10 <- Map(\(des, mdri, se.mdri) | |
svyby(~recent_prev, ~survey_id + age_group10, | |
des, svyprevincid, | |
mdri = mdri, se.mdri = se.mdri, | |
frr = 0.0, se.frr = 0.0, deff = TRUE), | |
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>% | |
bind_rows() %>% | |
mutate(sex = "both") %>% | |
rename(age_group = age_group10) | |
incid_sex_age_15to49 <- Map(\(des, mdri, se.mdri) | |
svyby(~recent_prev, ~survey_id + sex + age_15to49, | |
des, svyprevincid, | |
mdri = mdri, se.mdri = se.mdri, | |
frr = 0.0, se.frr = 0.0, deff = TRUE), | |
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>% | |
bind_rows() %>% | |
rename(age_group = age_15to49) | |
incid_age_15to49 <- Map(\(des, mdri, se.mdri) | |
svyby(~recent_prev, ~survey_id + age_15to49, | |
des, svyprevincid, | |
mdri = mdri, se.mdri = se.mdri, | |
frr = 0.0, se.frr = 0.0, deff = TRUE), | |
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>% | |
bind_rows() %>% | |
mutate(sex = "both") %>% | |
rename(age_group = age_15to49) | |
incid_sex_age_15to64 <- Map(\(des, mdri, se.mdri) | |
svyby(~recent_prev, ~survey_id + sex + age_15to64, | |
des, svyprevincid, | |
mdri = mdri, se.mdri = se.mdri, | |
frr = 0.0, se.frr = 0.0, deff = TRUE), | |
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>% | |
bind_rows() %>% | |
rename(age_group = age_15to64) | |
incid_age_15to64 <- Map(\(des, mdri, se.mdri) | |
svyby(~recent_prev, ~survey_id + age_15to64, | |
des, svyprevincid, | |
mdri = mdri, se.mdri = se.mdri, | |
frr = 0.0, se.frr = 0.0, deff = TRUE), | |
desspl, mdri[names(desspl)], se.mdri[names(desspl)]) %>% | |
bind_rows() %>% | |
mutate(sex = "both") %>% | |
rename(age_group = age_15to64) | |
incid <- bind_rows(incid_sex_age_15to49, incid_age_15to49, | |
incid_sex_age_15to64, incid_age_15to64, | |
incid_sex_age_group10, incid_age_group10, | |
incid_sex_age_group, incid_age_group) | |
#' Add 95% CI on log scale (avoid ci_lower < 0)) | |
incid <- incid %>% | |
mutate( | |
se.log_incid = se.incid / incid, | |
ci_l.incid = exp(log(incid) - qnorm(0.975) * se.log_incid), | |
ci_u.incid = exp(log(incid) + qnorm(0.975) * se.log_incid), | |
se.log_incid = NULL | |
) | |
incid <- right_join(phia_info, incid) %>% | |
as_tibble() | |
#' Add raw counts | |
#' | |
#' Assuming FRR = 0 and all HIV positive are tested for recent infection, | |
#' the incidence calculation becomes: [incidence] = [# recent] / (MDRI * [# negative]). | |
#' | |
#' Assuming no uncertaity in MDRI, the Poisson approximation becomes: | |
#' [# recent] ~ Poisson( [incidence] * MDRI * [# negative]) | |
#' | |
#' -> Use: | |
#' - n_pys_approx = MDRI * [# negative] as an approximation to PYs | |
#' - x_recent_approx = incidence * n_pys_approx | |
#' | |
#' The approximation ignores uncertainty from: | |
#' 1. Design effects (clustering and unequal sample weights) | |
#' 2. Proportion of HIV positive not tested for recent infection | |
#' 3. Uncertainty about MDRI | |
#' | |
#' Number 1 could be partially accounted for with Kish effective sample size (as | |
#' with other Naomi inputs) and number 2 through deflating the number negative | |
#' denominator by the proportion of positive respondents tested for HIV. | |
#' In practice proportion missing recent is pretty small in PHIA surveys, so | |
#' unlikely to be a major difference. | |
#' | |
dat <- dat %>% | |
mutate( | |
recent_prev_label = recent_prev %>% | |
recode("0. negative" = "n_negative", | |
"1. recent" = "n_pos_recent", | |
"2. long-term" = "n_pos_longterm", | |
"9. not tested" = "n_pos_nottest") | |
) | |
counts_sex_age_group <- dat %>% | |
count(survey_id, sex, age_group, recent_prev_label) %>% | |
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0) | |
counts_age_group <- dat %>% | |
count(survey_id, sex = "both", age_group, recent_prev_label) %>% | |
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0) | |
counts_sex_age_group10 <- dat %>% | |
count(survey_id, sex, age_group = age_group10, recent_prev_label) %>% | |
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0) | |
counts_age_group10 <- dat %>% | |
count(survey_id, sex = "both", age_group = age_group10, recent_prev_label) %>% | |
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0) | |
counts_sex_age_15to49 <- dat %>% | |
count(survey_id, sex, age_group = age_15to49, recent_prev_label) %>% | |
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0) | |
counts_age_15to49 <- dat %>% | |
count(survey_id, sex = "both", age_group = age_15to49, recent_prev_label) %>% | |
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0) | |
counts_sex_age_15to64 <- dat %>% | |
count(survey_id, sex, age_group = age_15to64, recent_prev_label) %>% | |
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0) | |
counts_age_15to64 <- dat %>% | |
count(survey_id, sex = "both", age_group = age_15to64, recent_prev_label) %>% | |
pivot_wider(names_from = recent_prev_label, values_from = n, values_fill = 0) | |
counts_raw <- bind_rows(counts_sex_age_15to49, counts_age_15to49, | |
counts_sex_age_15to64, counts_age_15to64, | |
counts_sex_age_group10, counts_age_group10, | |
counts_sex_age_group, counts_age_group) %>% | |
filter(!is.na(age_group)) | |
counts_raw <- counts_raw %>% | |
mutate( | |
mdri = mdri[survey_id], | |
mdri_se = se.mdri[survey_id] | |
) | |
incid <- incid %>% | |
left_join(counts_raw, by = join_by(survey_id, sex, age_group)) %>% | |
mutate( | |
n_pys_approx = mdri * n_negative, | |
x_recent_approx = incid * n_pys_approx | |
) | |
library(ggplot2) | |
incid %>% | |
filter(age_group == "Y015_049") %>% | |
mutate( | |
pys_eff = incid / se.incid^2, | |
pys_deff_ratio = n_pys_approx / pys_eff, | |
pys_ratio_mean = mean(pys_deff_ratio, na.rm = TRUE), | |
.by = sex | |
) %>% | |
ggplot(aes(survey_id, pys_deff_ratio)) + | |
geom_point() + | |
geom_hline(yintercept = 1.0, linetype = "dashed") + | |
geom_hline(aes(yintercept = pys_ratio_mean), color = "blue") + | |
facet_wrap(~sex) + | |
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + | |
labs(title = "Ratio of approx PYs to standard-error derived PYs: age 15-49") | |
write_csv(incid, "phia-incid-extract_2024-05-01.csv", na = "") |
Author
jeffeaton
commented
May 1, 2024
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment