Last active
December 6, 2024 10:08
-
-
Save USMortality/15c277a92a7a29152022cd66e8b56d78 to your computer and use it in GitHub Desktop.
Life Expectancy at Birth [USA, 2022]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(demography) | |
library(scales) | |
sf <- 2 | |
width <- 600 * sf | |
height <- 335 * sf | |
options(vsc.dev.args = list(width = width, height = height, res = 72 * sf)) | |
### Deaths | |
deaths <- read_csv(paste0( | |
"https://gist.githubusercontent.com/USMortality/", | |
"2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/", | |
"data_usa_deaths_single_years.csv" | |
), col_types = "cciiiciiciiiccci") |> | |
select(Age, Deaths) |> | |
setNames(c("age", "deaths")) |> | |
filter(!is.na(age)) |> | |
group_by(age) |> | |
summarize(deaths = sum(deaths)) | |
### Population | |
pop <- read_csv(paste0( | |
"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/", | |
"national/asrh/nc-est2023-agesex-res.csv" | |
), col_types = "iiiiiii") |> | |
filter(SEX == 0, AGE %in% 0:100) |> | |
select(AGE, POPESTIMATE2022) |> | |
setNames(c("age", "population")) |> | |
bind_rows(tibble(age = 101:125, population = NA)) | |
# Interpolate 100+ population to 100-125 | |
w <- exp(-0.66 * 0:25) | |
pop$w <- c(rep(NA, 100), w / sum(w)) | |
remainder <- pop$population[pop$age == 100] | |
pop$population[pop$age == 100] <- NA | |
pop <- pop |> | |
mutate(population = ifelse(!is.na(population), | |
population, ceiling(w * remainder) | |
)) |> | |
select(-w) | |
### Combined Dataset | |
df <- deaths |> | |
right_join(pop, by = join_by(age)) |> | |
mutate(across(everything(), as.integer), | |
deaths = ifelse(is.na(deaths), 0, deaths) | |
) |> | |
arrange(age) |> | |
filter(row_number() <= max(which(deaths > 0)) | deaths > 0) | |
### 1) Single ages, 0-125 | |
usa_demog <- demogdata( | |
data = matrix(df$deaths / df$population, ncol = 1), | |
pop = matrix(df$population, ncol = 1), | |
age = matrix(df$age, ncol = 1), | |
label = "USA", | |
type = "mortality", | |
name = "total", | |
years = 2022 | |
) | |
usa_life_table <- lifetable(usa_demog) | |
result_1 <- usa_life_table$ex[1] | |
### 2) Single ages, 0-85+ | |
df2 <- df |> | |
mutate(age = ifelse(age >= 85, "85+", as.character(age))) |> | |
group_by(age) |> | |
summarize( | |
deaths = sum(deaths), | |
population = sum(population), .groups = "drop" | |
) | |
# Evenly split 85+ to 85-100 | |
ages <- 85:100 | |
df2 <- bind_rows( | |
df2, tibble( | |
age = as.character(ages), | |
deaths = rep(df2$deaths[df2$age == "85+"] / length(ages), length(ages)), | |
population = | |
rep(df2$population[df2$age == "85+"] / length(ages), length(ages)) | |
) | |
) |> | |
filter(age != "85+") |> | |
mutate(age = as.integer(age)) |> | |
dplyr::arrange(age) | |
usa_demog <- demogdata( | |
data = matrix(df2$deaths / df2$population, ncol = 1), | |
pop = matrix(df2$population, ncol = 1), | |
age = matrix(df2$age, ncol = 1), | |
label = "USA", | |
type = "mortality", | |
name = "total", | |
years = 2022 | |
) | |
usa_life_table <- lifetable(usa_demog) | |
result_2a <- usa_life_table$ex[1] | |
usa_demog <- demogdata( | |
data = matrix(df2$deaths / df$population[0:101], ncol = 1), | |
pop = matrix(df$population[0:101], ncol = 1), | |
age = matrix(df$age[0:101], ncol = 1), | |
label = "USA", | |
type = "mortality", | |
name = "total", | |
years = 2022 | |
) | |
usa_life_table <- lifetable(usa_demog) | |
result_2b <- usa_life_table$ex[1] | |
### 3) 5y age groups, 0-85+ | |
df3 <- df |> | |
mutate( | |
age = case_when( | |
age < 5 ~ "0-4", | |
age < 10 ~ "5-9", | |
age < 15 ~ "10-14", | |
age < 20 ~ "15-19", | |
age < 25 ~ "20-24", | |
age < 30 ~ "25-29", | |
age < 35 ~ "30-34", | |
age < 40 ~ "35-39", | |
age < 45 ~ "40-44", | |
age < 50 ~ "45-49", | |
age < 55 ~ "50-54", | |
age < 60 ~ "55-59", | |
age < 65 ~ "60-64", | |
age < 70 ~ "65-69", | |
age < 75 ~ "70-74", | |
age < 80 ~ "75-79", | |
age < 85 ~ "80-84", | |
TRUE ~ "85+" | |
) | |
) |> | |
group_by(age) |> | |
summarize( | |
deaths = sum(deaths), population = sum(population), .groups = "drop" | |
) | |
# Function to disaggregate a single row into individual years | |
disaggregate_row <- function(age_range, deaths, population) { | |
if (age_range == "85+") { | |
years <- 85:100 # Define the single-year range for 85+ | |
} else { | |
years <- as.numeric( | |
unlist(strsplit(age_range, "-"))[1] | |
):as.numeric(unlist(strsplit(age_range, "-"))[2]) | |
} | |
tibble( | |
age = years, | |
deaths = rep(deaths / length(years), length(years)), | |
population = rep(population / length(years), length(years)) | |
) | |
} | |
# Apply the disaggregation to all rows | |
df3 <- df3 %>% | |
rowwise() %>% | |
do(disaggregate_row(.$age, .$deaths, .$population)) %>% | |
ungroup() | |
usa_demog <- demogdata( | |
data = matrix(df3$deaths / df3$population, ncol = 1), | |
pop = matrix(df3$population, ncol = 1), | |
age = matrix(df3$age, ncol = 1), | |
label = "USA", | |
type = "mortality", | |
name = "total", | |
years = 2022 | |
) | |
usa_life_table <- lifetable(usa_demog) | |
result_3a <- usa_life_table$ex[1] | |
usa_demog <- demogdata( | |
data = matrix(df3$deaths / df$population[0:101], ncol = 1), | |
pop = matrix(df$population[0:101], ncol = 1), | |
age = matrix(df$age[0:101], ncol = 1), | |
label = "USA", | |
type = "mortality", | |
name = "total", | |
years = 2022 | |
) | |
usa_life_table <- lifetable(usa_demog) | |
result_3b <- usa_life_table$ex[1] | |
# Plot | |
chart <- bind_rows( | |
df |> mutate(source = "Actuals"), | |
df2 |> mutate(source = "0-85+, single years"), | |
df3 |> mutate(source = "0-85+, 5y age groups") | |
) |> | |
pivot_longer(cols = -c(age, source)) |> | |
ggplot(aes(x = age, y = value, color = source)) + | |
geom_line() + | |
scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) + | |
labs( | |
title = "Deaths & Populations by single age groups [USA, Year=2022]", | |
x = "Age", | |
y = "", | |
color = "Source" | |
) + | |
theme(legend.position = "top") + | |
facet_wrap(vars(name), scales = "free") | |
ggsave( | |
filename = "chart1.png", | |
plot = chart, | |
width = width, | |
height = height, | |
units = "px", | |
dpi = 72 * sf, | |
device = grDevices::png, | |
type = "cairo" | |
) | |
# Compare results: | |
cat( | |
sprintf("1) 0-125, single year: %.2f\n", result_1), | |
sprintf( | |
"2a) 0-85+, single year: %.2f (diff: %.2f%%)\n", | |
result_2a, ((result_2a - result_1) / result_1) * 100 | |
), | |
sprintf( | |
"2b) 0-85+, single year (pop. 0-121): %.2f (diff: %.2f%%)\n", | |
result_2b, ((result_2b - result_1) / result_1) * 100 | |
), | |
sprintf( | |
"3a) 0-85+, 5y age groups: %.2f (diff: %.2f%%)\n", | |
result_3a, ((result_3a - result_1) / result_1) * 100 | |
), | |
sprintf( | |
"3b) 0-85+, 5y age groups (populations 0-121): %.2f (diff: %.2f%%)\n", | |
result_3b, ((result_3b - result_1) / result_1) * 100 | |
) | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
sf <- 2 | |
width <- 600 * sf | |
height <- 335 * sf | |
options(vsc.dev.args = list(width = width, height = height, res = 72 * sf)) | |
### Deaths | |
deaths <- read_csv(paste0( | |
"https://gist.githubusercontent.com/USMortality/", | |
"2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/", | |
"data_usa_deaths_single_years.csv" | |
), col_types = "cciiiciiciiiccci") |> | |
select(Age, Deaths) |> | |
setNames(c("age", "deaths")) |> | |
filter(!is.na(age)) |> | |
group_by(age) |> | |
summarize(deaths = sum(deaths)) | |
### Population | |
pop <- read_csv(paste0( | |
"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/", | |
"national/asrh/nc-est2023-agesex-res.csv" | |
), col_types = "iiiiiii") |> | |
filter(SEX == 0, AGE %in% 0:100) |> | |
select(AGE, POPESTIMATE2022) |> | |
setNames(c("age", "population")) |> | |
bind_rows(tibble(age = 101:125, population = NA)) | |
# Interpolate 100+ population to 100-125 | |
w <- exp(-0.66 * 0:25) | |
pop$w <- c(rep(NA, 100), w / sum(w)) | |
remainder <- pop$population[pop$age == 100] | |
pop$population[pop$age == 100] <- NA | |
pop <- pop |> | |
mutate(population = ifelse(!is.na(population), | |
population, ceiling(w * remainder) | |
)) |> | |
select(-w) | |
### Combined Dataset | |
df <- deaths |> | |
right_join(pop, by = join_by(age)) |> | |
mutate(across(everything(), as.integer), | |
deaths = ifelse(is.na(deaths), 0, deaths), | |
cmr = deaths / population * 100000 | |
) |> | |
arrange(age) |> | |
filter(row_number() <= max(which(deaths > 0)) | deaths > 0) | |
# Plot | |
quartz(width = 6, height = 3) | |
df |> | |
filter(age > 70) |> | |
pivot_longer(!1) |> | |
ggplot(aes(x = age, y = value)) + | |
geom_line() + | |
scale_y_log10() + | |
labs(title = "Crude Mortality by Single Years of Age [USA, Year=2022]") + | |
facet_wrap(vars(name)) | |
### Calculate LE | |
# Method: Hyndman et al. | |
library(demography) | |
usa_demog <- demogdata( | |
data = matrix(df$deaths / df$population, ncol = 1), | |
pop = matrix(df$population, ncol = 1), | |
age = matrix(df$age, ncol = 1), | |
label = "USA", | |
type = "mortality", | |
name = "total", | |
years = 2022 | |
) | |
usa_life_table <- lifetable(usa_demog) | |
# LE at Birth: | |
result_1 <- usa_life_table$ex[1] | |
# Method: Ulf Lorré | |
ages <- df$age + 0.5 | |
mx <- df$deaths / df$population | |
qx <- mx / (1 + mx / 2) | |
p <- rep(1, length(ages)) | |
d <- rep(0, length(ages)) | |
for (a in 1:(length(ages) - 1)) { | |
d[a] <- p[a] * qx[a] | |
p[a + 1] <- max(0, p[a] - d[a]) | |
} | |
d[length(ages)] <- p[length(ages)] * qx[length(ages)] | |
result_2 <- weighted.mean(ages, d) | |
# Method Life Table | |
life_table <- df |> | |
mutate( | |
# Mortality hazard rate (average rate of dying during the interval) | |
mx = deaths / population, | |
# Crude probability of dying (approximation, assumes low mortality rates) | |
# qx = mx, | |
# Adjusted probability of dying (accounts for timing of deaths during the interval) | |
qx = mx / (1 + mx / 2), | |
px = 1 - qx, # Survival probability | |
# Survivors at start of age interval (assume l0 = 100,000) | |
lx = lag(cumprod(px), default = 1) * 100000, | |
dx = lx * qx, # Number of deaths in the interval | |
Lx = lx - 0.5 * dx, # Person-years lived in the interval | |
Tx = rev(cumsum(rev(Lx))), # Total person-years lived beyond age x | |
ex = Tx / lx # Life expectancy at age x | |
) | |
result_3 <- life_table[1, ]$ex | |
### Henjin | |
age <- df$age | |
mx <- df$cmr / 100000 | |
# Probability of dying (adjusted for exposure time) | |
qx <- mx / (1 + mx / 2) | |
qx <- pmin(qx, 1) # Ensure qx does not exceed 1 | |
px <- 1 - qx # Probability of surviving | |
# Survivors at start of each age group | |
lx <- numeric(length(age)) | |
lx[1] <- 1e5 # Starting population (arbitrary base of 100,000) | |
for (i in 2:length(age)) { | |
lx[i] <- lx[i - 1] * px[i - 1] | |
} | |
# Number of deaths in each age group | |
dx <- lx * qx | |
# Person-years lived in each age group | |
Lx <- lx - 0.5 * dx | |
# Adjust last age group's person-years lived | |
if (mx[length(mx)] > 0) { | |
Lx[length(Lx)] <- lx[length(Lx)] / mx[length(mx)] | |
} else { | |
Lx[length(Lx)] <- lx[length(Lx)] # If mx is zero, assume no deaths | |
} | |
# Cumulative person-years remaining beyond each age | |
Tx <- numeric(length(age)) | |
Tx[length(Tx)] <- Lx[length(Lx)] | |
for (i in (length(age) - 1):1) { | |
Tx[i] <- Tx[i + 1] + Lx[i] | |
} | |
# Life expectancy at each age | |
ex <- Tx / lx | |
# Return the life table as a data frame | |
life_table <- data.frame(age, mx, qx, px, lx, dx, Lx, Tx, ex) | |
result_4 <- life_table[1, ]$ex | |
cat( | |
sprintf("Hyndman: %f\n", result_1), | |
sprintf("Ulf Lorré: %f\n", result_2), | |
sprintf("ChatGPT (Ben): %f\n", result_3), | |
sprintf("ChatGPT (Henjin): %f\n", result_4) | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(splines) | |
library(tidyverse) | |
library(demography) | |
library(scales) | |
sf <- 2 | |
width <- 600 * sf | |
height <- 335 * sf | |
options(vsc.dev.args = list(width = width, height = height, res = 72 * sf)) | |
### Deaths | |
deaths <- read_csv(paste0( | |
"https://gist.githubusercontent.com/USMortality/", | |
"2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/", | |
"data_usa_deaths_single_years.csv" | |
), col_types = "cciiiciiciiiccci") |> | |
select(Age, Deaths) |> | |
setNames(c("age", "deaths")) |> | |
filter(!is.na(age)) |> | |
group_by(age) |> | |
summarize(deaths = sum(deaths)) | |
### Population | |
pop <- read_csv(paste0( | |
"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/", | |
"national/asrh/nc-est2023-agesex-res.csv" | |
), col_types = "iiiiiii") |> | |
filter(SEX == 0, AGE %in% 0:100) |> | |
select(AGE, POPESTIMATE2022) |> | |
setNames(c("age", "population")) |> | |
bind_rows(tibble(age = 101:125, population = NA)) | |
# Interpolate 100+ population to 100-125 | |
w <- exp(-0.66 * 0:25) | |
pop$w <- c(rep(NA, 100), w / sum(w)) | |
remainder <- pop$population[pop$age == 100] | |
pop$population[pop$age == 100] <- NA | |
pop <- pop |> | |
mutate(population = ifelse(!is.na(population), | |
population, ceiling(w * remainder) | |
)) |> | |
select(-w) | |
### Combined Dataset | |
df <- deaths |> | |
right_join(pop, by = join_by(age)) |> | |
mutate(across(everything(), as.integer), | |
deaths = ifelse(is.na(deaths), 0, deaths) | |
) |> | |
arrange(age) |> | |
filter(row_number() <= max(which(deaths > 0)) | deaths > 0) |> | |
mutate(cmr = deaths / population * 100000) | |
### Functions | |
# Aggregate single age data into age groups | |
aggregate_age_groups <- function(df, age_groups) { | |
df |> | |
mutate(age = case_when(!!!age_groups)) |> | |
group_by(age) |> | |
summarize( | |
deaths = sum(deaths, na.rm = TRUE), | |
population = sum(population, na.rm = TRUE), | |
.groups = "drop" | |
) |> | |
mutate(cmr = deaths / population * 100000) |> | |
select(age, cmr) | |
} | |
# Function to disaggregate a single row into individual years | |
disaggregate_row_lin <- function(age_range, cmr) { | |
if (grepl("\\+", age_range)) { # Check if age_range includes a plus sign | |
start_age <- as.numeric(sub("\\+.*", "", age_range)) | |
years <- start_age:100 | |
} else { | |
years <- as.numeric( | |
unlist(strsplit(age_range, "-"))[1] | |
):as.numeric(unlist(strsplit(age_range, "-"))[2]) | |
} | |
tibble(age = years, cmr = rep(cmr, length(years))) | |
} | |
disaggregate_lin <- function(df) { | |
df |> | |
rowwise() |> | |
do(disaggregate_row_lin(.$age, .$cmr)) |> | |
ungroup() | |
} | |
get_lifetable <- function(df) { | |
df |> | |
mutate( | |
# Mortality hazard rate (average rate of dying during the interval) | |
mx = cmr / 100000, | |
# Adjusted probability of dying (accounts for timing of deaths during the interval) | |
qx = mx / (1 + mx / 2), | |
px = 1 - qx, # Survival probability | |
# Survivors at start of age interval (assume l0 = 100,000) | |
lx = lag(cumprod(px), default = 1) * 100000, | |
dx = lx * qx, # Number of deaths in the interval | |
Lx = lx - 0.5 * dx, # Person-years lived in the interval | |
Tx = rev(cumsum(rev(Lx))), # Total person-years lived beyond age x | |
ex = Tx / lx # Life expectancy at age x | |
) | |
} | |
# Age group definitions | |
age_groups_5y <- rlang::exprs( | |
age < 5 ~ "00-04", | |
age < 10 ~ "05-09", | |
age < 15 ~ "10-14", | |
age < 20 ~ "15-19", | |
age < 25 ~ "20-24", | |
age < 30 ~ "25-29", | |
age < 35 ~ "30-34", | |
age < 40 ~ "35-39", | |
age < 45 ~ "40-44", | |
age < 50 ~ "45-49", | |
age < 55 ~ "50-54", | |
age < 60 ~ "55-59", | |
age < 65 ~ "60-64", | |
age < 70 ~ "65-69", | |
age < 75 ~ "70-74", | |
age < 80 ~ "75-79", | |
age < 85 ~ "80-84", | |
TRUE ~ "85+" | |
) | |
age_groups_10y <- rlang::exprs( | |
age < 10 ~ "00-09", | |
age < 20 ~ "10-19", | |
age < 30 ~ "20-29", | |
age < 40 ~ "30-39", | |
age < 50 ~ "40-49", | |
age < 60 ~ "50-59", | |
age < 70 ~ "60-69", | |
age < 80 ~ "70-79", | |
TRUE ~ "80+" | |
) | |
age_groups_20y <- rlang::exprs( | |
age < 20 ~ "00-19", | |
age < 40 ~ "20-39", | |
age < 60 ~ "40-59", | |
age < 80 ~ "60-79", | |
TRUE ~ "80+" | |
) | |
age_groups_mortality_org <- rlang::exprs( | |
age < 15 ~ "00-14", | |
age < 65 ~ "15-64", | |
age < 75 ~ "65-74", | |
age < 85 ~ "75-84", | |
TRUE ~ "85+" | |
) | |
# Perform aggregations | |
df5y <- aggregate_age_groups(df, age_groups_5y) | |
# df10y <- aggregate_age_groups(df, age_groups_10y) | |
# df20y <- aggregate_age_groups(df, age_groups_20y) | |
# df_mortality_org <- aggregate_age_groups(df, age_groups_mortality_org) | |
# Use the midpoints of the age groups | |
data <- df5y %>% | |
mutate(midpoint = case_when( | |
age == "85+" ~ 90, # Manually assign the midpoint for "80+" | |
TRUE ~ rowMeans(strsplit(age, "-") %>% lapply(as.numeric) %>% do.call(rbind, .)) | |
)) | |
# Fit a spline model | |
spline_model <- smooth.spline(data$midpoint, data$cmr) | |
# Predict for single ages from 0 to 100 | |
single_ages <- seq(0, 100, by = 1) | |
predicted_cmr <- predict(spline_model, single_ages, deriv = 0)$y | |
# Ensure predictions are non-negative (if needed) | |
predicted_cmr[predicted_cmr < 0] <- 0 | |
dfx <- tibble(age = single_ages, cmr = predicted_cmr) | |
bind_rows( | |
df |> mutate(source = "Actuals") |> select(age, cmr, source), | |
df5y |> disaggregate_lin() |> mutate(source = "0-85+, 5y"), | |
dfx |> mutate(source = "Interpolated (Spline)") | |
) |> | |
pivot_longer(cols = -c(age, source)) |> | |
ggplot(aes(x = age, y = value, color = source)) + | |
geom_line() + | |
scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) + | |
labs( | |
title = "Crude Mortality by single year age groups [USA, Year=2022]", | |
x = "Age", | |
y = "Crude Mortality Rate per 100k (CMR)", | |
color = "Source" | |
) + | |
theme(legend.position = "top") + | |
facet_wrap(vars(name), scales = "free") | |
a <- df |> get_lifetable() | |
b <- dfx |> get_lifetable() | |
cat( | |
sprintf("1) 0-125, single year: %.2f\n", a[1, ]$ex), | |
sprintf( | |
"2) Spline Interpolated; 0-85+, 5y: %.2f (diff: %.2f%%)\n", | |
b[1, ]$ex, ((b[1, ]$ex - a[1, ]$ex) / a[1, ]$ex) * 100 | |
) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment