Skip to content

Instantly share code, notes, and snippets.

@USMortality
Last active December 6, 2024 10:08
Show Gist options
  • Save USMortality/15c277a92a7a29152022cd66e8b56d78 to your computer and use it in GitHub Desktop.
Save USMortality/15c277a92a7a29152022cd66e8b56d78 to your computer and use it in GitHub Desktop.
Life Expectancy at Birth [USA, 2022]
library(tidyverse)
library(demography)
library(scales)
sf <- 2
width <- 600 * sf
height <- 335 * sf
options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))
### Deaths
deaths <- read_csv(paste0(
"https://gist.githubusercontent.com/USMortality/",
"2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/",
"data_usa_deaths_single_years.csv"
), col_types = "cciiiciiciiiccci") |>
select(Age, Deaths) |>
setNames(c("age", "deaths")) |>
filter(!is.na(age)) |>
group_by(age) |>
summarize(deaths = sum(deaths))
### Population
pop <- read_csv(paste0(
"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/",
"national/asrh/nc-est2023-agesex-res.csv"
), col_types = "iiiiiii") |>
filter(SEX == 0, AGE %in% 0:100) |>
select(AGE, POPESTIMATE2022) |>
setNames(c("age", "population")) |>
bind_rows(tibble(age = 101:125, population = NA))
# Interpolate 100+ population to 100-125
w <- exp(-0.66 * 0:25)
pop$w <- c(rep(NA, 100), w / sum(w))
remainder <- pop$population[pop$age == 100]
pop$population[pop$age == 100] <- NA
pop <- pop |>
mutate(population = ifelse(!is.na(population),
population, ceiling(w * remainder)
)) |>
select(-w)
### Combined Dataset
df <- deaths |>
right_join(pop, by = join_by(age)) |>
mutate(across(everything(), as.integer),
deaths = ifelse(is.na(deaths), 0, deaths)
) |>
arrange(age) |>
filter(row_number() <= max(which(deaths > 0)) | deaths > 0)
### 1) Single ages, 0-125
usa_demog <- demogdata(
data = matrix(df$deaths / df$population, ncol = 1),
pop = matrix(df$population, ncol = 1),
age = matrix(df$age, ncol = 1),
label = "USA",
type = "mortality",
name = "total",
years = 2022
)
usa_life_table <- lifetable(usa_demog)
result_1 <- usa_life_table$ex[1]
### 2) Single ages, 0-85+
df2 <- df |>
mutate(age = ifelse(age >= 85, "85+", as.character(age))) |>
group_by(age) |>
summarize(
deaths = sum(deaths),
population = sum(population), .groups = "drop"
)
# Evenly split 85+ to 85-100
ages <- 85:100
df2 <- bind_rows(
df2, tibble(
age = as.character(ages),
deaths = rep(df2$deaths[df2$age == "85+"] / length(ages), length(ages)),
population =
rep(df2$population[df2$age == "85+"] / length(ages), length(ages))
)
) |>
filter(age != "85+") |>
mutate(age = as.integer(age)) |>
dplyr::arrange(age)
usa_demog <- demogdata(
data = matrix(df2$deaths / df2$population, ncol = 1),
pop = matrix(df2$population, ncol = 1),
age = matrix(df2$age, ncol = 1),
label = "USA",
type = "mortality",
name = "total",
years = 2022
)
usa_life_table <- lifetable(usa_demog)
result_2a <- usa_life_table$ex[1]
usa_demog <- demogdata(
data = matrix(df2$deaths / df$population[0:101], ncol = 1),
pop = matrix(df$population[0:101], ncol = 1),
age = matrix(df$age[0:101], ncol = 1),
label = "USA",
type = "mortality",
name = "total",
years = 2022
)
usa_life_table <- lifetable(usa_demog)
result_2b <- usa_life_table$ex[1]
### 3) 5y age groups, 0-85+
df3 <- df |>
mutate(
age = case_when(
age < 5 ~ "0-4",
age < 10 ~ "5-9",
age < 15 ~ "10-14",
age < 20 ~ "15-19",
age < 25 ~ "20-24",
age < 30 ~ "25-29",
age < 35 ~ "30-34",
age < 40 ~ "35-39",
age < 45 ~ "40-44",
age < 50 ~ "45-49",
age < 55 ~ "50-54",
age < 60 ~ "55-59",
age < 65 ~ "60-64",
age < 70 ~ "65-69",
age < 75 ~ "70-74",
age < 80 ~ "75-79",
age < 85 ~ "80-84",
TRUE ~ "85+"
)
) |>
group_by(age) |>
summarize(
deaths = sum(deaths), population = sum(population), .groups = "drop"
)
# Function to disaggregate a single row into individual years
disaggregate_row <- function(age_range, deaths, population) {
if (age_range == "85+") {
years <- 85:100 # Define the single-year range for 85+
} else {
years <- as.numeric(
unlist(strsplit(age_range, "-"))[1]
):as.numeric(unlist(strsplit(age_range, "-"))[2])
}
tibble(
age = years,
deaths = rep(deaths / length(years), length(years)),
population = rep(population / length(years), length(years))
)
}
# Apply the disaggregation to all rows
df3 <- df3 %>%
rowwise() %>%
do(disaggregate_row(.$age, .$deaths, .$population)) %>%
ungroup()
usa_demog <- demogdata(
data = matrix(df3$deaths / df3$population, ncol = 1),
pop = matrix(df3$population, ncol = 1),
age = matrix(df3$age, ncol = 1),
label = "USA",
type = "mortality",
name = "total",
years = 2022
)
usa_life_table <- lifetable(usa_demog)
result_3a <- usa_life_table$ex[1]
usa_demog <- demogdata(
data = matrix(df3$deaths / df$population[0:101], ncol = 1),
pop = matrix(df$population[0:101], ncol = 1),
age = matrix(df$age[0:101], ncol = 1),
label = "USA",
type = "mortality",
name = "total",
years = 2022
)
usa_life_table <- lifetable(usa_demog)
result_3b <- usa_life_table$ex[1]
# Plot
chart <- bind_rows(
df |> mutate(source = "Actuals"),
df2 |> mutate(source = "0-85+, single years"),
df3 |> mutate(source = "0-85+, 5y age groups")
) |>
pivot_longer(cols = -c(age, source)) |>
ggplot(aes(x = age, y = value, color = source)) +
geom_line() +
scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) +
labs(
title = "Deaths & Populations by single age groups [USA, Year=2022]",
x = "Age",
y = "",
color = "Source"
) +
theme(legend.position = "top") +
facet_wrap(vars(name), scales = "free")
ggsave(
filename = "chart1.png",
plot = chart,
width = width,
height = height,
units = "px",
dpi = 72 * sf,
device = grDevices::png,
type = "cairo"
)
# Compare results:
cat(
sprintf("1) 0-125, single year: %.2f\n", result_1),
sprintf(
"2a) 0-85+, single year: %.2f (diff: %.2f%%)\n",
result_2a, ((result_2a - result_1) / result_1) * 100
),
sprintf(
"2b) 0-85+, single year (pop. 0-121): %.2f (diff: %.2f%%)\n",
result_2b, ((result_2b - result_1) / result_1) * 100
),
sprintf(
"3a) 0-85+, 5y age groups: %.2f (diff: %.2f%%)\n",
result_3a, ((result_3a - result_1) / result_1) * 100
),
sprintf(
"3b) 0-85+, 5y age groups (populations 0-121): %.2f (diff: %.2f%%)\n",
result_3b, ((result_3b - result_1) / result_1) * 100
)
)
library(tidyverse)
sf <- 2
width <- 600 * sf
height <- 335 * sf
options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))
### Deaths
deaths <- read_csv(paste0(
"https://gist.githubusercontent.com/USMortality/",
"2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/",
"data_usa_deaths_single_years.csv"
), col_types = "cciiiciiciiiccci") |>
select(Age, Deaths) |>
setNames(c("age", "deaths")) |>
filter(!is.na(age)) |>
group_by(age) |>
summarize(deaths = sum(deaths))
### Population
pop <- read_csv(paste0(
"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/",
"national/asrh/nc-est2023-agesex-res.csv"
), col_types = "iiiiiii") |>
filter(SEX == 0, AGE %in% 0:100) |>
select(AGE, POPESTIMATE2022) |>
setNames(c("age", "population")) |>
bind_rows(tibble(age = 101:125, population = NA))
# Interpolate 100+ population to 100-125
w <- exp(-0.66 * 0:25)
pop$w <- c(rep(NA, 100), w / sum(w))
remainder <- pop$population[pop$age == 100]
pop$population[pop$age == 100] <- NA
pop <- pop |>
mutate(population = ifelse(!is.na(population),
population, ceiling(w * remainder)
)) |>
select(-w)
### Combined Dataset
df <- deaths |>
right_join(pop, by = join_by(age)) |>
mutate(across(everything(), as.integer),
deaths = ifelse(is.na(deaths), 0, deaths),
cmr = deaths / population * 100000
) |>
arrange(age) |>
filter(row_number() <= max(which(deaths > 0)) | deaths > 0)
# Plot
quartz(width = 6, height = 3)
df |>
filter(age > 70) |>
pivot_longer(!1) |>
ggplot(aes(x = age, y = value)) +
geom_line() +
scale_y_log10() +
labs(title = "Crude Mortality by Single Years of Age [USA, Year=2022]") +
facet_wrap(vars(name))
### Calculate LE
# Method: Hyndman et al.
library(demography)
usa_demog <- demogdata(
data = matrix(df$deaths / df$population, ncol = 1),
pop = matrix(df$population, ncol = 1),
age = matrix(df$age, ncol = 1),
label = "USA",
type = "mortality",
name = "total",
years = 2022
)
usa_life_table <- lifetable(usa_demog)
# LE at Birth:
result_1 <- usa_life_table$ex[1]
# Method: Ulf Lorré
ages <- df$age + 0.5
mx <- df$deaths / df$population
qx <- mx / (1 + mx / 2)
p <- rep(1, length(ages))
d <- rep(0, length(ages))
for (a in 1:(length(ages) - 1)) {
d[a] <- p[a] * qx[a]
p[a + 1] <- max(0, p[a] - d[a])
}
d[length(ages)] <- p[length(ages)] * qx[length(ages)]
result_2 <- weighted.mean(ages, d)
# Method Life Table
life_table <- df |>
mutate(
# Mortality hazard rate (average rate of dying during the interval)
mx = deaths / population,
# Crude probability of dying (approximation, assumes low mortality rates)
# qx = mx,
# Adjusted probability of dying (accounts for timing of deaths during the interval)
qx = mx / (1 + mx / 2),
px = 1 - qx, # Survival probability
# Survivors at start of age interval (assume l0 = 100,000)
lx = lag(cumprod(px), default = 1) * 100000,
dx = lx * qx, # Number of deaths in the interval
Lx = lx - 0.5 * dx, # Person-years lived in the interval
Tx = rev(cumsum(rev(Lx))), # Total person-years lived beyond age x
ex = Tx / lx # Life expectancy at age x
)
result_3 <- life_table[1, ]$ex
### Henjin
age <- df$age
mx <- df$cmr / 100000
# Probability of dying (adjusted for exposure time)
qx <- mx / (1 + mx / 2)
qx <- pmin(qx, 1) # Ensure qx does not exceed 1
px <- 1 - qx # Probability of surviving
# Survivors at start of each age group
lx <- numeric(length(age))
lx[1] <- 1e5 # Starting population (arbitrary base of 100,000)
for (i in 2:length(age)) {
lx[i] <- lx[i - 1] * px[i - 1]
}
# Number of deaths in each age group
dx <- lx * qx
# Person-years lived in each age group
Lx <- lx - 0.5 * dx
# Adjust last age group's person-years lived
if (mx[length(mx)] > 0) {
Lx[length(Lx)] <- lx[length(Lx)] / mx[length(mx)]
} else {
Lx[length(Lx)] <- lx[length(Lx)] # If mx is zero, assume no deaths
}
# Cumulative person-years remaining beyond each age
Tx <- numeric(length(age))
Tx[length(Tx)] <- Lx[length(Lx)]
for (i in (length(age) - 1):1) {
Tx[i] <- Tx[i + 1] + Lx[i]
}
# Life expectancy at each age
ex <- Tx / lx
# Return the life table as a data frame
life_table <- data.frame(age, mx, qx, px, lx, dx, Lx, Tx, ex)
result_4 <- life_table[1, ]$ex
cat(
sprintf("Hyndman: %f\n", result_1),
sprintf("Ulf Lorré: %f\n", result_2),
sprintf("ChatGPT (Ben): %f\n", result_3),
sprintf("ChatGPT (Henjin): %f\n", result_4)
)
library(splines)
library(tidyverse)
library(demography)
library(scales)
sf <- 2
width <- 600 * sf
height <- 335 * sf
options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))
### Deaths
deaths <- read_csv(paste0(
"https://gist.githubusercontent.com/USMortality/",
"2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/",
"data_usa_deaths_single_years.csv"
), col_types = "cciiiciiciiiccci") |>
select(Age, Deaths) |>
setNames(c("age", "deaths")) |>
filter(!is.na(age)) |>
group_by(age) |>
summarize(deaths = sum(deaths))
### Population
pop <- read_csv(paste0(
"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/",
"national/asrh/nc-est2023-agesex-res.csv"
), col_types = "iiiiiii") |>
filter(SEX == 0, AGE %in% 0:100) |>
select(AGE, POPESTIMATE2022) |>
setNames(c("age", "population")) |>
bind_rows(tibble(age = 101:125, population = NA))
# Interpolate 100+ population to 100-125
w <- exp(-0.66 * 0:25)
pop$w <- c(rep(NA, 100), w / sum(w))
remainder <- pop$population[pop$age == 100]
pop$population[pop$age == 100] <- NA
pop <- pop |>
mutate(population = ifelse(!is.na(population),
population, ceiling(w * remainder)
)) |>
select(-w)
### Combined Dataset
df <- deaths |>
right_join(pop, by = join_by(age)) |>
mutate(across(everything(), as.integer),
deaths = ifelse(is.na(deaths), 0, deaths)
) |>
arrange(age) |>
filter(row_number() <= max(which(deaths > 0)) | deaths > 0) |>
mutate(cmr = deaths / population * 100000)
### Functions
# Aggregate single age data into age groups
aggregate_age_groups <- function(df, age_groups) {
df |>
mutate(age = case_when(!!!age_groups)) |>
group_by(age) |>
summarize(
deaths = sum(deaths, na.rm = TRUE),
population = sum(population, na.rm = TRUE),
.groups = "drop"
) |>
mutate(cmr = deaths / population * 100000) |>
select(age, cmr)
}
# Function to disaggregate a single row into individual years
disaggregate_row_lin <- function(age_range, cmr) {
if (grepl("\\+", age_range)) { # Check if age_range includes a plus sign
start_age <- as.numeric(sub("\\+.*", "", age_range))
years <- start_age:100
} else {
years <- as.numeric(
unlist(strsplit(age_range, "-"))[1]
):as.numeric(unlist(strsplit(age_range, "-"))[2])
}
tibble(age = years, cmr = rep(cmr, length(years)))
}
disaggregate_lin <- function(df) {
df |>
rowwise() |>
do(disaggregate_row_lin(.$age, .$cmr)) |>
ungroup()
}
get_lifetable <- function(df) {
df |>
mutate(
# Mortality hazard rate (average rate of dying during the interval)
mx = cmr / 100000,
# Adjusted probability of dying (accounts for timing of deaths during the interval)
qx = mx / (1 + mx / 2),
px = 1 - qx, # Survival probability
# Survivors at start of age interval (assume l0 = 100,000)
lx = lag(cumprod(px), default = 1) * 100000,
dx = lx * qx, # Number of deaths in the interval
Lx = lx - 0.5 * dx, # Person-years lived in the interval
Tx = rev(cumsum(rev(Lx))), # Total person-years lived beyond age x
ex = Tx / lx # Life expectancy at age x
)
}
# Age group definitions
age_groups_5y <- rlang::exprs(
age < 5 ~ "00-04",
age < 10 ~ "05-09",
age < 15 ~ "10-14",
age < 20 ~ "15-19",
age < 25 ~ "20-24",
age < 30 ~ "25-29",
age < 35 ~ "30-34",
age < 40 ~ "35-39",
age < 45 ~ "40-44",
age < 50 ~ "45-49",
age < 55 ~ "50-54",
age < 60 ~ "55-59",
age < 65 ~ "60-64",
age < 70 ~ "65-69",
age < 75 ~ "70-74",
age < 80 ~ "75-79",
age < 85 ~ "80-84",
TRUE ~ "85+"
)
age_groups_10y <- rlang::exprs(
age < 10 ~ "00-09",
age < 20 ~ "10-19",
age < 30 ~ "20-29",
age < 40 ~ "30-39",
age < 50 ~ "40-49",
age < 60 ~ "50-59",
age < 70 ~ "60-69",
age < 80 ~ "70-79",
TRUE ~ "80+"
)
age_groups_20y <- rlang::exprs(
age < 20 ~ "00-19",
age < 40 ~ "20-39",
age < 60 ~ "40-59",
age < 80 ~ "60-79",
TRUE ~ "80+"
)
age_groups_mortality_org <- rlang::exprs(
age < 15 ~ "00-14",
age < 65 ~ "15-64",
age < 75 ~ "65-74",
age < 85 ~ "75-84",
TRUE ~ "85+"
)
# Perform aggregations
df5y <- aggregate_age_groups(df, age_groups_5y)
# df10y <- aggregate_age_groups(df, age_groups_10y)
# df20y <- aggregate_age_groups(df, age_groups_20y)
# df_mortality_org <- aggregate_age_groups(df, age_groups_mortality_org)
# Use the midpoints of the age groups
data <- df5y %>%
mutate(midpoint = case_when(
age == "85+" ~ 90, # Manually assign the midpoint for "80+"
TRUE ~ rowMeans(strsplit(age, "-") %>% lapply(as.numeric) %>% do.call(rbind, .))
))
# Fit a spline model
spline_model <- smooth.spline(data$midpoint, data$cmr)
# Predict for single ages from 0 to 100
single_ages <- seq(0, 100, by = 1)
predicted_cmr <- predict(spline_model, single_ages, deriv = 0)$y
# Ensure predictions are non-negative (if needed)
predicted_cmr[predicted_cmr < 0] <- 0
dfx <- tibble(age = single_ages, cmr = predicted_cmr)
bind_rows(
df |> mutate(source = "Actuals") |> select(age, cmr, source),
df5y |> disaggregate_lin() |> mutate(source = "0-85+, 5y"),
dfx |> mutate(source = "Interpolated (Spline)")
) |>
pivot_longer(cols = -c(age, source)) |>
ggplot(aes(x = age, y = value, color = source)) +
geom_line() +
scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) +
labs(
title = "Crude Mortality by single year age groups [USA, Year=2022]",
x = "Age",
y = "Crude Mortality Rate per 100k (CMR)",
color = "Source"
) +
theme(legend.position = "top") +
facet_wrap(vars(name), scales = "free")
a <- df |> get_lifetable()
b <- dfx |> get_lifetable()
cat(
sprintf("1) 0-125, single year: %.2f\n", a[1, ]$ex),
sprintf(
"2) Spline Interpolated; 0-85+, 5y: %.2f (diff: %.2f%%)\n",
b[1, ]$ex, ((b[1, ]$ex - a[1, ]$ex) / a[1, ]$ex) * 100
)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment