Skip to content

Instantly share code, notes, and snippets.

@USMortality
Last active February 27, 2025 03:13
Show Gist options
  • Save USMortality/d1f3b7d1785ef9b1f42358dcba232af4 to your computer and use it in GitHub Desktop.
Save USMortality/d1f3b7d1785ef9b1f42358dcba232af4 to your computer and use it in GitHub Desktop.
Age-Standardized Mortality Rate (ASMR) [Germany]
library(tidyverse)
library(scales)
library(zoo)
library(ggrepel)
sf <- 2
width <- 600 * sf
height <- 335 * sf
options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))
parse_age_groups <- function(df) {
df$age_group <- sub("unter 1 Jahr", "0", df$age_group)
df$age_group <- sub("85 Jahre und mehr", "85+", df$age_group)
df$age_group <- sub("Insgesamt", "all", df$age_group)
df$age_group <- sub("-Jährige", "", df$age_group)
df$age_group <- sub("100 Jahre und mehr", "100+", df$age_group)
df
}
summarize_age_groups <- function(df, col, target_ag) {
generate_age_group <- function(age) {
# Return NA if input is NA
if (is.na(age)) {
return(NA_character_)
}
# Convert age to character and remove "+" if present
age_char <- as.character(age)
age_num <- as.numeric(sub("\\+", "", age_char))
# Check if conversion to numeric failed
if (is.na(age_num)) {
return(NA_character_)
}
# Iterate through age groups
for (range in target_ag) {
if (grepl("\\+$", range)) {
bound <- as.numeric(sub("\\+", "", range))
if (age_num >= bound) {
return(range)
}
} else if (grepl("-", range)) {
bounds <- as.numeric(unlist(strsplit(range, "-")))
if (age_num >= bounds[1] && age_num <= bounds[2]) {
return(range)
}
} else {
bound <- as.numeric(range)
if (age_num == bound) {
return(range)
}
}
}
# Return NA if no match found
return(NA_character_)
}
df |>
mutate(
age_group = case_when(
age_group == "all" ~ "all",
TRUE ~ sapply(age_group, generate_age_group)
)
) |>
group_by(date, age_group) |>
summarize({{ col }} := sum(.data[[col]], na.rm = TRUE), .groups = "drop") |>
filter(!is.na(age_group))
}
days_in_year <- function(year) {
start_date <- ymd(paste0(year, "-01-01"))
end_date <- ymd(paste0(year, "-12-31"))
as.integer(difftime(end_date, start_date, units = "days")) + 1
}
interpolate_daily <- function(data, col, ref_day) {
data <- data |>
mutate(date = as.Date(paste0(date, "-", ref_day))) |>
group_by(age_group) |>
complete(date = seq(min(date), max(date), by = "day"))
if ("population" %in% names(data)) {
data <- mutate(data, across({{ col }}, ~ na.approx(as.numeric(.),
rule = 2, na.rm = FALSE
)))
}
if ("deaths" %in% names(data)) {
data <- data |>
fill(deaths) |>
mutate(deaths = deaths / days_in_year(year(date)))
}
data
}
ag5 <- c("0-14", "15-64", "65-74", "75-84", "85+")
ag10y <- c(
"0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"
)
# Population
pop <- read_delim(
"https://apify.mortality.watch/destatis-genesis/12411-0005.csv.gz",
delim = ";", skip = 6
)
names(pop)[1] <- "age_group"
names(pop)[-1] <- sub(".*(.{4})$", "\\1", names(pop)[-1])
pop <- pop |>
filter(!is.na(`2000`)) |>
parse_age_groups()
pop <- pop |>
pivot_longer(
cols = !c("age_group"), names_to = "date", values_to = "population"
)
pop$date <- as.integer(pop$date)
pop$population <- as.integer(pop$population)
pop_1y_daily <- pop |> interpolate_daily("population", ref_day = "12-31")
# Deaths
deaths <- read_delim(
"https://apify.mortality.watch/destatis-genesis/12613-0003.csv.gz",
delim = ";",
skip = 5,
col_types = cols(
.default = col_integer(),
`...1` = col_character(),
`...2` = col_character()
)
)
names(deaths)[1] <- "sex"
names(deaths)[2] <- "age_group"
deaths <- deaths |>
fill(sex, .direction = "down") |>
filter(sex == "Insgesamt") |>
pivot_longer(
cols = !c("sex", "age_group"), names_to = "date", values_to = "deaths"
) |>
mutate(date = as.integer(date)) |>
filter(!is.na(date)) |>
parse_age_groups() |>
summarize_age_groups("deaths", c(as.character(0:84), "85+"))
interpolate_with_future <- function(df) {
last <- df |> filter(date == max(date))
last$date <- as.character(as.numeric(last$date) + 1)
last$deaths <- NA
rbind(df, last) |>
interpolate_daily("deaths", "01-01") |>
filter(date != as.Date(paste0(unique(last$date), "-01-01")))
}
deaths_1y_daily <- deaths |> interpolate_with_future()
calc_asmr <- function(pop, deaths) {
# Combine Daily Deaths/Population
df <- pop |>
inner_join(deaths, by = join_by(age_group, date)) |>
group_by(date, age_group) |>
mutate(cmr = deaths / population * 100000) |>
mutate(date = ifelse(is.Date(date), year(date), date)) |>
group_by(date, age_group) |>
summarize(
population = mean(population), cmr = sum(cmr),
.groups = "drop"
) |>
filter(date > 1970)
# Std Pop 2020
std_pop <- pop |>
filter(date == ifelse(is.Date(date), as.Date("2020-01-01"), 2020)) |>
group_by(date, age_group) |>
summarize(population = sum(population), .groups = "drop") |>
mutate(weight = if_else(
age_group != "all",
population / population[age_group == "all"],
NA_real_
)) |>
filter(age_group != "all") |>
select(age_group, weight)
# ASMR
df |>
filter(date > 1990) |>
inner_join(std_pop, by = join_by(age_group)) |>
group_by(date) |>
summarize(asmr = sum(cmr * weight))
}
asmr <- calc_asmr(pop_1y_daily, deaths_1y_daily)
# Plot
chart <- asmr |> ggplot(aes(x = date, y = asmr)) +
geom_line() +
geom_text_repel(
aes(
label = round(asmr),
vjust = ifelse(date %% 2 == 0, -1, 2)
),
size = 3,
) +
labs(
title = "Age-Standardized Mortality Rate (ASMR) [Germany]",
subtitle = paste0(
"1y, 0-80+ · Std. Pop.: 2020 · Source: Statistical Office ",
"Germany (destatis) · @USMortality"
),
caption = paste0(
"Daily population interpolated from 12/31. Deaths spread daily. ",
"CMR weighted by std population, then aggregated for yearly ASMR."
),
x = "Year",
y = "ASMR"
) +
scale_y_continuous(labels = comma) +
theme_bw()
ggplot2::ggsave(
filename = "chart1.png", plot = chart, width = width, height = height,
units = "px", dpi = 72 * sf, device = grDevices::png, type = c("cairo")
)
# Excess
# Parameters
n <- 3 # Length of short-term average
h <- 4 # Forecast horizon (2017-2020 to 2021-2024)
obs_start_year <- 2020
# Step 1: Filter for the 2017-2020 period to determine the baseline level
baseline_period <- asmr |>
filter(date >= obs_start_year - n & date < obs_start_year) |>
summarise(level = mean(asmr, na.rm = TRUE)) |>
pull(level)
# Step 2: Calculate the variance in percent for each historical
# 3-year + 3-year slice
variance_percent <- function(data, n, h) {
changes <- list()
for (i in 1:(nrow(data) - (n + h) + 1)) {
slice1 <- data$asmr[i:(i + n - 1)]
slice2 <- data$asmr[(i + n):(i + n + h - 1)]
percent_change <- (mean(slice2) - mean(slice1)) / mean(slice1) * 100
changes <- append(changes, percent_change)
}
return(changes)
}
# Calculate historical variances
variances <- variance_percent(asmr |> filter(date < obs_start_year), n, h)
# Step 3: Use the variance to construct the expected interval
expected_interval <- function(level, variances) {
mean_variance <- mean(unlist(variances), na.rm = TRUE)
interval_lower <- level * (1 - mean_variance / 100)
interval_upper <- level * (1 + mean_variance / 100)
return(tibble::tibble(
interval_lower = interval_lower,
interval_upper = interval_upper
))
}
# Step 4: Calculate the forecast interval for 2021-2024 based on the
# 2017-2020 baseline
forecast_result <- expected_interval(baseline_period, variances)
asmr_plot <- asmr |>
filter(date >= 2000 & date <= 2024) |>
mutate(forecast = ifelse(date >= 2020, "Forecast", "Actual"))
chart <- ggplot(asmr_plot, aes(x = date, y = asmr)) +
geom_line() +
geom_text_repel(
aes(
label = round(asmr),
vjust = ifelse(date %% 2 == 0, -1, 2)
),
size = 3,
) +
geom_ribbon(
data = tibble(
date = 2020:2024,
interval_lower = forecast_result$interval_lower,
interval_upper = forecast_result$interval_upper
),
aes(x = date, ymin = interval_lower, ymax = interval_upper),
inherit.aes = FALSE,
alpha = 0.2, fill = "blue"
) +
labs(
title =
"Age-Standardized Mortality Rate (ASMR) & Expected Levels [Germany]",
subtitle =
paste0(
"1y, 0-85+ · Std. Pop.: 2020 · Source: Statistical Office ",
"(destatis) · @USMortality"
),
caption = paste0(
"Based on a 3y average (2017-19), the forecast interval reflects the ",
"3y variance observed over 1970-2019. This provides an expected range\n",
"for future values."
),
y = "ASMR",
x = "Year"
) +
theme_bw()
ggplot2::ggsave(
filename = "chart2.png", plot = chart, width = width, height = height,
units = "px", dpi = 72 * sf, device = grDevices::png, type = c("cairo")
)
# source("/Users/ben/Downloads/2.r")
library(tidyverse)
library(scales)
library(zoo)
library(ggrepel)
sf <- 2
width <- 600 * sf
height <- 335 * sf
options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))
parse_age_groups <- function(df) {
df$age_group <- sub("unter 1 Jahr", "0", df$age_group)
df$age_group <- sub("85 Jahre und mehr", "85+", df$age_group)
df$age_group <- sub("Insgesamt", "all", df$age_group)
df$age_group <- sub("-Jährige", "", df$age_group)
df$age_group <- sub("100 Jahre und mehr", "100+", df$age_group)
df
}
summarize_age_groups <- function(df, col, target_ag) {
generate_age_group <- function(age) {
# Return NA if input is NA
if (is.na(age)) {
return(NA_character_)
}
# Convert age to character and remove "+" if present
age_char <- as.character(age)
age_num <- as.numeric(sub("\\+", "", age_char))
# Check if conversion to numeric failed
if (is.na(age_num)) {
return(NA_character_)
}
# Iterate through age groups
for (range in target_ag) {
if (grepl("\\+$", range)) {
bound <- as.numeric(sub("\\+", "", range))
if (age_num >= bound) {
return(range)
}
} else if (grepl("-", range)) {
bounds <- as.numeric(unlist(strsplit(range, "-")))
if (age_num >= bounds[1] && age_num <= bounds[2]) {
return(range)
}
} else {
bound <- as.numeric(range)
if (age_num == bound) {
return(range)
}
}
}
# Return NA if no match found
return(NA_character_)
}
df |>
mutate(
age_group = case_when(
age_group == "all" ~ "all",
TRUE ~ sapply(age_group, generate_age_group)
)
) |>
group_by(date, age_group) |>
summarize({{ col }} := sum(.data[[col]], na.rm = TRUE), .groups = "drop") |>
filter(!is.na(age_group))
}
days_in_year <- function(year) {
start_date <- ymd(paste0(year, "-01-01"))
end_date <- ymd(paste0(year, "-12-31"))
as.integer(difftime(end_date, start_date, units = "days")) + 1
}
interpolate_daily <- function(data, col, ref_day) {
data <- data |>
mutate(date = as.Date(paste0(date, "-", ref_day))) |>
group_by(age_group) |>
complete(date = seq(min(date), max(date), by = "day"))
if ("population" %in% names(data)) {
data <- mutate(data, across({{ col }}, ~ na.approx(as.numeric(.),
rule = 2, na.rm = FALSE
)))
}
if ("deaths" %in% names(data)) {
data <- data |>
fill(deaths) |>
mutate(deaths = deaths / days_in_year(year(date)))
}
data
}
ag5 <- c("0-14", "15-64", "65-74", "75-84", "85+")
ag10y <- c(
"0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"
)
# Population
pop <- read_delim(
"https://apify.mortality.watch/destatis-genesis/12411-0005.csv.gz",
delim = ";", skip = 6
)
names(pop)[1] <- "age_group"
names(pop)[-1] <- sub(".*(.{4})$", "\\1", names(pop)[-1])
pop <- pop |>
filter(!is.na(`2000`)) |>
parse_age_groups()
pop <- pop |>
pivot_longer(
cols = !c("age_group"), names_to = "date", values_to = "population"
)
pop$date <- as.integer(pop$date)
pop$population <- as.integer(pop$population)
pop_5 <- pop |> summarize_age_groups("population", ag5)
pop_10y <- pop |> summarize_age_groups("population", ag10y)
pop_10y_daily <- pop_10y |> interpolate_daily("population", ref_day = "12-31")
pop_1y_daily <- pop |> interpolate_daily("population", ref_day = "12-31")
# Deaths
deaths <- read_delim(
"https://apify.mortality.watch/destatis-genesis/12613-0003.csv.gz",
delim = ";",
skip = 5,
col_types = cols(
.default = col_integer(),
`...1` = col_character(),
`...2` = col_character()
)
)
names(deaths)[1] <- "sex"
names(deaths)[2] <- "age_group"
deaths <- deaths |>
fill(sex, .direction = "down") |>
filter(sex == "Insgesamt") |>
pivot_longer(
cols = !c("sex", "age_group"), names_to = "date", values_to = "deaths"
) |>
mutate(date = as.integer(date)) |>
filter(!is.na(date)) |>
parse_age_groups() |>
summarize_age_groups("deaths", c(as.character(0:84), "85+"))
interpolate_with_future <- function(df) {
last <- df |> filter(date == max(date))
last$date <- as.character(as.numeric(last$date) + 1)
last$deaths <- NA
rbind(df, last) |>
interpolate_daily("deaths", "01-01") |>
filter(date != as.Date(paste0(unique(last$date), "-01-01")))
}
deaths_5 <- deaths |> summarize_age_groups("deaths", ag5)
deaths_10y <- deaths |> summarize_age_groups("deaths", ag10y)
deaths_10y_daily <- deaths_10y |> interpolate_with_future()
deaths_1y_daily <- deaths |> interpolate_with_future()
calc_asmr <- function(pop, deaths) {
# Combine Daily Deaths/Population
df <- pop |>
inner_join(deaths, by = join_by(age_group, date)) |>
group_by(date, age_group) |>
mutate(cmr = deaths / population * 100000) |>
mutate(date = ifelse(is.Date(date), year(date), date)) |>
group_by(date, age_group) |>
summarize(
population = mean(population), cmr = sum(cmr),
.groups = "drop"
) |>
filter(date > 1970)
# Std Pop 2020
std_pop <- pop |>
filter(date == ifelse(is.Date(date), as.Date("2020-01-01"), 2020)) |>
group_by(date, age_group) |>
summarize(population = sum(population), .groups = "drop") |>
mutate(weight = if_else(
age_group != "all",
population / population[age_group == "all"],
NA_real_
)) |>
filter(age_group != "all") |>
select(age_group, weight)
# ASMR
df |>
filter(date > 1990) |>
inner_join(std_pop, by = join_by(age_group)) |>
group_by(date) |>
summarize(asmr = sum(cmr * weight))
}
# 1) Simple year joins, by Mortality.org age groups
# (0-14, 15-64, 65-74, 75-84, 85+)
df1 <- calc_asmr(pop_5, deaths_5)
# 2) Simple year joins, by 10y age groups (0-10, .., 80+)
df2 <- calc_asmr(pop_10y, deaths_10y)
# 3) Daily year joins, by 10y age groups (0-10, .., 80+)
df3 <- calc_asmr(pop_10y_daily, deaths_10y_daily)
# 4) Simple year joins, by single age groups (0-85)
df4 <- calc_asmr(pop, deaths)
# 5) Daily year joins, by single age groups
df5 <- calc_asmr(pop_1y_daily, deaths_1y_daily)
asmr <- df1 |>
left_join(df2, by = "date") |>
left_join(df3, by = "date") |>
left_join(df4, by = "date") |>
left_join(df5, by = "date") |>
setNames(c("date", "ag5", "10y", "10y_daily", "1y", "1y_daily")) |>
pivot_longer(cols = 2:6, names_to = "type", values_to = "asmr")
df_plot <- asmr |> filter(date >= 2010)
# Plot
chart <-
df_plot |>
ggplot(aes(x = date, y = asmr, color = type, linetype = type)) +
geom_line() +
scale_color_manual(values = c(
"ag5" = "blue",
"1y_daily" = "black",
"1y" = "black",
"10y" = "red",
"10y_daily" = "red"
)) +
scale_linetype_manual(values = c(
"ag5" = "solid",
"1y_daily" = "solid",
"1y" = "dashed",
"10y" = "dashed",
"10y_daily" = "solid"
)) +
geom_text_repel(
data = subset(df_plot, type == "10y_daily"),
aes(
label = round(asmr),
vjust = ifelse(date %% 2 == 0, -1, 2)
),
size = 3
) +
labs(
title = paste0(
"Age-Standardized Mortality Rate (ASMR) by Calculation ",
"Strategy [Germany]"
),
subtitle = paste0(
"Std. Pop.: 2020 · Source: Statistical Office ",
"Germany (destatis) · @USMortality"
),
x = "Year",
y = "ASMR"
) +
scale_y_continuous(labels = comma) +
theme_bw()
ggplot2::ggsave(
filename = "chart3.png", plot = chart, width = width, height = height,
units = "px", dpi = 72 * sf, device = grDevices::png, type = c("cairo")
)
# source("/Users/ben/Downloads/1.r")
@USMortality
Copy link
Author

chart1 chart2 chart3

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment