Last active
September 6, 2021 03:25
-
-
Save goldingn/85d2b7363d2da2888254bb6042144150 to your computer and use it in GitHub Desktop.
building synthetic contact matrices with generalised additive models
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# build synthetic age-structured contact matrices with GAMs | |
library(tidyverse) | |
library(mgcv) | |
library(patchwork) | |
library(socialmixr) | |
# return the polymod-average population age distribution in 5y | |
# increments (weight country population distributions by number of participants) | |
# note that we don't want to weight by survey age distributions for this, since | |
# the total number of *participants* represents the sampling | |
get_polymod_population <- function() { | |
polymod$participants %>% | |
filter( | |
!is.na(year) | |
) %>% | |
group_by( | |
country, | |
year | |
) %>% | |
summarise( | |
participants = n(), | |
.groups = "drop" | |
) %>% | |
left_join( | |
wpp_age(), | |
by = c("country", "year") | |
) %>% | |
filter( | |
!is.na(lower.age.limit) | |
) %>% | |
group_by( | |
lower.age.limit | |
) %>% | |
summarise( | |
population = weighted.mean(population, participants) | |
) | |
} | |
# apply a 45 degree rotation to age contact pairs, to model trends along the | |
# diagonal, and perpendicularly across the diagonal | |
add_rotated_ages <- function(contact_data) { | |
theta <- pi / 4 | |
contact_data %>% | |
mutate( | |
along_diagonal = age_from * cos(theta) + age_to * cos(theta), | |
across_diagonal = age_from * cos(theta) - age_to * sin(theta) | |
) | |
} | |
# add fractions of the population in each age group that attend school/work | |
# (average FTE), to compute the probability that both participant and contact | |
# attend school/work | |
add_school_work_participation <- function(contact_data) { | |
contact_data %>% | |
mutate( | |
across( | |
starts_with("age"), | |
.fns = list( | |
# made up example - replace with education statistics | |
school_fraction = ~ case_when( | |
# preschool | |
.x %in% 2:4 ~ 0.5, | |
# compulsory education | |
.x %in% 5:16 ~ 1, | |
# voluntary education | |
.x %in% 17:18 ~ 0.5, | |
# university | |
.x %in% 19:25 ~ 0.1, | |
# other | |
TRUE ~ 0.05 | |
), | |
# made up example - replace with labour force statistics | |
work_fraction = ~ case_when( | |
# child labour | |
.x %in% 12:19 ~ 0.2, | |
# young adults (not at school) | |
.x %in% 20:24 ~ 0.7, | |
# main workforce | |
.x %in% 25:60 ~ 1, | |
# possibly retired | |
.x %in% 61:65 ~ 0.7, | |
# other | |
TRUE ~ 0.05 | |
) | |
), | |
.names = "{.fn}_{.col}" | |
), | |
# the probabilities that both parties go to school/work. May not be the same | |
# place. But proportional to the increase in contacts due to attendance | |
school_probability = school_fraction_age_from * school_fraction_age_to, | |
work_probability = work_fraction_age_from * work_fraction_age_to, | |
) | |
} | |
# return an interpolating function to get populations in 1y age increments from | |
# chunkier distributions produced by socialmixr::wpp_age() (must contain `lower.age.limit` and `population`) | |
get_age_population_function <- function(population) { | |
population <- population %>% | |
arrange(lower.age.limit) | |
# compute the widths of age bins | |
bin_widths <- diff(population$lower.age.limit) | |
final_bin_width <- bin_widths[length(bin_widths)] | |
bin_widths <- c(bin_widths, final_bin_width) | |
# range of ages (assume the final bin width is that same as the previous one, | |
# since we cannot extrapolate the infinite upper bound without known the upper | |
# age limit) | |
min_age <- min(population$lower.age.limit) | |
max_age <- max(population$lower.age.limit) + final_bin_width | |
# interpolator to 1y age groups up to 100 | |
spline <- splinefun( | |
x = population$lower.age.limit, | |
y = population$population / bin_widths, | |
) | |
# wrap this up in a function to handle values outside this range | |
function(age) { | |
population <- spline(age) | |
invalid <- age < min(age) | age > max_age | |
population[invalid] <- 0 | |
pmax(population, 0) | |
} | |
} | |
# add the population distribution for contact ages. If 'polymod' then use the | |
# participant-weighted average of polymod country/year distributions | |
add_population_age_to <- function(contact_data, population = get_polymod_population()) { | |
# get function to interpolate population age distributions to 1y bins | |
age_population_function <- get_age_population_function(population) | |
# add the population in each 'to' age for the survey context | |
contact_data %>% | |
mutate( | |
pop_age_to = age_population_function(age_to) | |
) | |
} | |
# add features required for modelling to the dataset | |
add_modelling_features <- function(contact_data, ...) { | |
contact_data %>% | |
add_population_age_to(...) %>% | |
add_school_work_participation() %>% | |
add_rotated_ages() | |
} | |
# format polymod data and filter contacts to certain settings | |
get_polymod_contact_data <- function( | |
setting = c("all", "home", "work", "school", "other"), | |
ages = 0:100, | |
contact_age_imputation = c("sample", "mean", "remove_participant") | |
) { | |
setting <- match.arg(setting) | |
contact_age_imputation <- match.arg(contact_age_imputation) | |
contact_data <- polymod$participants %>% | |
left_join( | |
polymod$contacts, | |
by = "part_id" | |
) | |
# impute contact ages according to the required method | |
contact_data_imputed <- contact_data %>% | |
mutate( | |
cnt_age_sampled = floor( | |
# suppress warnings about NAs in runif | |
suppressWarnings( | |
runif( | |
n = n(), | |
min = cnt_age_est_min, | |
max = cnt_age_est_max + 1 | |
) | |
) | |
), | |
cnt_age_mean = floor( | |
cnt_age_est_min + (cnt_age_est_max + 1 - cnt_age_est_min) / 2 | |
), | |
cnt_age = case_when( | |
!is.na(cnt_age_exact) ~ as.numeric(cnt_age_exact), | |
contact_age_imputation == "sample" ~ cnt_age_sampled, | |
contact_age_imputation == "mean" ~ cnt_age_mean, | |
TRUE ~ NA_real_ | |
), | |
) | |
# filter out any participants with missing contact ages or settings (can't | |
# just remove the contacts as that will bias the count) | |
contact_data_filtered <- contact_data_imputed %>% | |
group_by(part_id) %>% | |
mutate( | |
missing_any_contact_age = any(is.na(cnt_age)), | |
missing_any_contact_setting = any( | |
is.na(cnt_home) | | |
is.na(cnt_work) | | |
is.na(cnt_school) | | |
is.na(cnt_transport) | | |
is.na(cnt_leisure) | | |
is.na(cnt_otherplace) | |
) | |
) %>% | |
ungroup() %>% | |
filter( | |
!is.na(part_age), | |
!missing_any_contact_age, | |
!missing_any_contact_setting | |
) | |
# get contacts by setting (keeping 0s, so we can record 0 contacts for some individuals) | |
contact_data_setting <- contact_data_filtered %>% | |
mutate( | |
contacted = case_when( | |
setting == "all" ~ 1L, | |
setting == "home" ~ cnt_home, | |
setting == "school" ~ cnt_school, | |
setting == "work" ~ cnt_work, | |
setting == "other" ~ pmax(cnt_transport, cnt_leisure, cnt_otherplace), | |
) | |
) | |
# collapse down number of contacts per participant and contact age | |
contact_data_setting %>% | |
select( | |
part_id, | |
age_from = part_age, | |
age_to = cnt_age, | |
contacted | |
) %>% | |
complete( | |
nesting(age_from, part_id), | |
age_to = ages, | |
fill = list(contacted = 0) | |
) %>% | |
group_by( | |
age_from, | |
age_to | |
) %>% | |
summarise( | |
contacts = sum(contacted), | |
participants = n_distinct(part_id), | |
.groups = "drop" | |
) | |
} | |
# predict contacts to a given population at full 1y resolution | |
predict_contacts_1y <- function(model, population, age_min = 0, age_max = 100) { | |
all_ages <- age_min:age_max | |
# predict contacts to all integer years, adjusting for the population in a given place | |
expand_grid( | |
age_from = all_ages, | |
age_to = all_ages, | |
) %>% | |
# add on prediction features, setting the population to predict to | |
add_modelling_features( | |
population = population | |
) %>% | |
mutate( | |
# prediction | |
contacts = predict( | |
model, | |
newdata = ., | |
type = "response" | |
), | |
# uncertainty | |
se_contacts = predict( | |
model, | |
newdata = ., | |
type = "response", | |
se.fit = TRUE | |
)$se.fit | |
) %>% | |
select( | |
age_from, | |
age_to, | |
contacts, | |
se_contacts | |
) | |
} | |
# aggregate predicted contacts from complete 1y resolution to a stated resolution | |
# must pass in the population to do approppriate weighting of 'from' age groups | |
aggregate_predicted_contacts <- function( | |
predicted_contacts_1y, | |
population, | |
age_breaks = c( | |
seq(0, 75, by = 5), | |
Inf | |
) | |
) { | |
# get function for 1y age populations in this country | |
age_population_function <- get_age_population_function(population) | |
# aggregate contacts within age breaks | |
predicted_contacts_1y %>% | |
mutate( | |
age_group_to = cut( | |
pmax(0.1, age_to), | |
age_breaks | |
) | |
) %>% | |
filter( | |
!is.na(age_group_to) | |
) %>% | |
# sum the number of contacts to the 'to' age groups, for each integer | |
# participant age | |
group_by( | |
age_from, | |
age_group_to | |
) %>% | |
summarise( | |
contacts = sum(contacts), | |
.groups = "drop" | |
) %>% | |
# *average* the total contacts within the 'from' contacts, weighted by the | |
# population distribution (to get contacts for the population-average ember of | |
# that age group) | |
mutate( | |
pop_age_from = age_population_function(age_from), | |
age_group_from = cut( | |
pmax(0.1, age_from), | |
age_breaks | |
) | |
) %>% | |
filter( | |
!is.na(age_group_from) | |
) %>% | |
group_by( | |
age_group_from, | |
age_group_to | |
) %>% | |
summarise( | |
contacts = weighted.mean(contacts, pop_age_from), | |
.groups = "drop" | |
) | |
} | |
# fit a single GAM contact model to a dataset | |
fit_single_contact_model <- function(contact_data, population) { | |
# contact model for all locations together | |
contact_data %>% | |
add_modelling_features( | |
population = population | |
) %>% | |
bam( | |
contacts ~ | |
# multiplicative offset for population in the 'to' age group, to account for | |
# opportunity to contact | |
offset(log(pop_age_to)) + | |
# deviation of contact age distribution from population age distribution | |
s(age_to) + | |
# number of contacts by age | |
s(age_from) + | |
# intergenerational contact patterns | |
s(abs(age_from - age_to)) + | |
# probabilities of both attending (any) school/work | |
school_probability + | |
work_probability, | |
family = stats::poisson, | |
# add number of participants as a multilpicative offset here rather than in | |
# the formula, so it is not needed for prediction, | |
offset = log(participants), | |
data = . | |
) | |
} | |
# predict number of contacts in all combinations of the age groups specified by | |
# age_breaks, given a model and a population distribution of the time/place to | |
# predict to | |
predict_contacts <- function ( | |
model, | |
population, | |
age_breaks = c(seq(0, 75, by = 5), Inf) | |
) { | |
population <- population %>% | |
arrange(lower.age.limit) | |
age_min_integration <- min(population$lower.age.limit) | |
bin_widths <- diff(population$lower.age.limit) | |
final_bin_width <- bin_widths[length(bin_widths)] | |
age_max_integration <- max(population$lower.age.limit) + final_bin_width | |
pred_1y <- predict_contacts_1y( | |
model = model, | |
population = population, | |
age_min = age_min_integration, | |
age_max = age_max_integration | |
) | |
pred_groups <- aggregate_predicted_contacts( | |
predicted_contacts_1y = pred_1y, | |
population = population, | |
age_breaks = age_breaks | |
) | |
pred_groups | |
} | |
predictions_to_matrix <- function(contact_predictions) { | |
contact_predictions %>% | |
pivot_wider( | |
names_from = age_group_to, | |
values_from = contacts | |
) %>% | |
column_to_rownames( | |
"age_group_from" | |
) %>% | |
as.matrix() | |
} | |
# given a named list of contact datasets (with names giving the setting, and | |
# assumed to together make up the full set of contacts for individuals in the | |
# survey), a representative population distribution for the survey, and a set of | |
# age breaks at which to aggregate contacts, return a set of predicted contact | |
# matrices for each setting, and for all combined. | |
estimate_setting_contacts <- function( | |
contact_data_list, | |
survey_population, | |
prediction_population = survey_population, | |
age_breaks | |
) { | |
setting_models <- lapply( | |
X = contact_data_list, | |
FUN = fit_single_contact_model, | |
population = survey_population | |
) | |
setting_predictions <- lapply( | |
X = setting_models, | |
FUN = predict_contacts, | |
population = prediction_population, | |
age_breaks = age_breaks | |
) | |
setting_matrices <- lapply( | |
X = setting_predictions, | |
FUN = predictions_to_matrix | |
) | |
combination <- Reduce("+", setting_matrices) | |
setting_matrices$all <- combination | |
setting_matrices | |
} | |
# use ggplot to plot a matrix in the output format | |
plot_matrix <- function(matrix) { | |
matrix %>% | |
matrix_to_predictions() %>% | |
ggplot( | |
aes( | |
x = age_group_from, | |
y = age_group_to, | |
fill = contacts | |
) | |
) + | |
geom_tile() + | |
coord_fixed() + | |
scale_fill_distiller( | |
direction = 1, | |
trans = "sqrt" | |
) + | |
theme_minimal() + | |
theme( | |
axis.text = element_text( | |
size = 6, | |
angle = 45, | |
hjust = 1 | |
) | |
) | |
} | |
# convert a contact matrix as output by these functions (or socialmixr) into a | |
# long-form tibble | |
matrix_to_predictions <- function(contact_matrix) { | |
contact_matrix %>% | |
as_tibble( | |
rownames = "age_group_from" | |
) %>% | |
pivot_longer( | |
cols = -c(age_group_from), | |
names_to = "age_group_to", | |
values_to = "contacts" | |
) %>% | |
mutate( | |
across( | |
starts_with("age_group"), | |
~factor(.x, levels = unique(.x)) | |
) | |
) | |
} | |
# fit all-of-polymod model and extrapolate to a given population an age breaks | |
extrapolate_polymod <- function( | |
population, | |
age_breaks = c(seq(0, 75, by = 5), Inf) | |
) { | |
estimate_setting_contacts( | |
contact_data_list = get_polymod_setting_data(), | |
survey_population = get_polymod_population(), | |
prediction_population = population, | |
age_breaks = age_breaks | |
) | |
} | |
get_polymod_setting_data <- function() { | |
list( | |
home = get_polymod_contact_data("home"), | |
work = get_polymod_contact_data("work"), | |
school = get_polymod_contact_data("school"), | |
other = get_polymod_contact_data("other") | |
) | |
} | |
plot_setting_matrices <- function( | |
matrices, | |
title = "Setting-specific synthetic contact matrices (all polymod data)" | |
) { | |
plot_matrix(matrices$home) + | |
ggtitle("home") + | |
plot_matrix(matrices$school) + | |
ggtitle("school") + | |
plot_matrix(matrices$work) + | |
ggtitle("work") + | |
plot_matrix(matrices$other) + | |
ggtitle("other") + | |
plot_layout(ncol = 2) + | |
plot_annotation( | |
title = title | |
) | |
} | |
# analysis of polymod data | |
# set age breaks | |
age_breaks_5y <- c(seq(0, 75, by = 5), Inf) | |
age_breaks_1y <- c(seq(0, 100, by = 1), Inf) | |
# fit a single overall contact model to polymod | |
m_all <- fit_single_contact_model( | |
contact_data = get_polymod_contact_data("all"), | |
population = get_polymod_population() | |
) | |
# predict contacts at 1y and 5y resolutions for inspection | |
synthetic_all_5y <- predict_contacts( | |
model = m_all, | |
population = get_polymod_population(), | |
age_breaks = age_breaks_5y | |
) %>% | |
predictions_to_matrix() | |
synthetic_all_1y <- predict_contacts( | |
model = m_all, | |
population = get_polymod_population(), | |
age_breaks = age_breaks_1y | |
) %>% | |
predictions_to_matrix() | |
# compute setting-specific and combined age matrices for polymod | |
synthetic_settings_5y_polymod <- extrapolate_polymod( | |
population = get_polymod_population() | |
) | |
# extrapolate to other contexts | |
synthetic_settings_5y_italy_2005 <- extrapolate_polymod( | |
population = wpp_age("Italy", "2005") | |
) | |
synthetic_settings_5y_germany_2005 <- extrapolate_polymod( | |
population = wpp_age("Germany", "2005") | |
) | |
synthetic_settings_5y_bolivia_2015 <- extrapolate_polymod( | |
population = wpp_age("Bolivia", "2015") | |
) | |
synthetic_settings_5y_ghana_2015 <- extrapolate_polymod( | |
population = wpp_age("Ghana", "2015") | |
) | |
# empirical 5y polymod matrix | |
empirical_all_5y <- contact_matrix( | |
survey = polymod, | |
age.limits = seq(0, 75, by = 5), | |
symmetric = FALSE, | |
split = FALSE, | |
missing.participant.age = "remove", | |
missing.contact.age = "remove" | |
)$matrix | |
# plot setting-specific matrices | |
plot_setting_matrices(synthetic_settings_5y_polymod) | |
ggsave( | |
"~/Desktop/synthetic_setting_specific_polymod.png", | |
width = 9, | |
height = 8, | |
bg = "white" | |
) | |
plot_setting_matrices( | |
synthetic_settings_5y_italy_2005, | |
title = "Setting-specific synthetic contact matrices (Italy 2005 projected)" | |
) | |
ggsave( | |
"~/Desktop/synthetic_setting_specific_italy.png", | |
width = 9, | |
height = 8, | |
bg = "white" | |
) | |
plot_setting_matrices( | |
synthetic_settings_5y_germany_2005, | |
title = "Setting-specific synthetic contact matrices (Germany 2005 projected)" | |
) | |
ggsave( | |
"~/Desktop/synthetic_setting_specific_germany.png", | |
width = 9, | |
height = 8, | |
bg = "white" | |
) | |
plot_setting_matrices( | |
synthetic_settings_5y_bolivia_2015, | |
title = "Setting-specific synthetic contact matrices (Bolivia 2015 projected)" | |
) | |
ggsave( | |
"~/Desktop/synthetic_setting_specific_bolivia.png", | |
width = 9, | |
height = 8, | |
bg = "white" | |
) | |
plot_setting_matrices( | |
synthetic_settings_5y_ghana_2015, | |
title = "Setting-specific synthetic contact matrices (Ghana 2015 projected)" | |
) | |
ggsave( | |
"~/Desktop/synthetic_setting_specific_ghana.png", | |
width = 9, | |
height = 8, | |
bg = "white" | |
) | |
# plot empirical vs synthetic matrices | |
plot_matrix(empirical_all_5y) + | |
ggtitle("empirical (socialmixr)") + | |
plot_matrix(synthetic_settings_5y_polymod$all) + | |
ggtitle("synthetic by setting, combined") + | |
plot_matrix(synthetic_all_5y) + | |
ggtitle("synthetic all at once") + | |
plot_matrix(synthetic_all_1y) + | |
ggtitle("synthetic all at once (1y)") + | |
theme(axis.text = element_blank()) + | |
plot_layout(ncol = 2) + | |
plot_annotation( | |
title = "Empirical vs synthetic matrices (all polymod data)" | |
) | |
ggsave( | |
"~/Desktop/empirical_vs_synthetic.png", | |
width = 9, | |
height = 8, | |
bg = "white" | |
) | |
# visualise empirical contact rate estimates | |
bind_rows( | |
home = get_polymod_contact_data("home"), | |
school = get_polymod_contact_data("school"), | |
work = get_polymod_contact_data("work"), | |
other = get_polymod_contact_data("other"), | |
.id = "setting" | |
) %>% | |
mutate( | |
rate = contacts / participants, | |
setting = factor( | |
setting, | |
levels = c( | |
"home", "school", "work", "other" | |
) | |
) | |
) %>% | |
group_by( | |
setting | |
) %>% | |
mutate( | |
`relative contact rate` = rate / max(rate) | |
) %>% | |
ungroup() %>% | |
ggplot( | |
aes( | |
x = age_from, | |
y = age_to, | |
fill = `relative contact rate` | |
) | |
) + | |
facet_wrap( | |
~ setting, | |
ncol = 2, | |
scales = "free" | |
) + | |
geom_tile() + | |
scale_fill_distiller( | |
direction = 1, | |
trans = "sqrt" | |
) + | |
theme_minimal() | |
ggsave( | |
"~/Desktop/polymod_raw_settings.png", | |
width = 9, | |
height = 8, | |
bg = "white" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment