USMortality · December 6, 2024 10:08
diff --git a/compare_disaggreation.r b/compare_disaggreation.r
 library(tidyverse)
 library(demography)
 library(scales)

 sf <- 2
 width <- 600 * sf
 height <- 335 * sf

 options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))


 ### Deaths
 deaths <- read_csv(paste0(
  "https://gist.githubusercontent.com/USMortality/",
  "2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/",
  "data_usa_deaths_single_years.csv"
 ), col_types = "cciiiciiciiiccci") |>
  select(Age, Deaths) |>
  setNames(c("age", "deaths")) |>
  filter(!is.na(age)) |>
  group_by(age) |>
  summarize(deaths = sum(deaths))


 ### Population
 pop <- read_csv(paste0(
  "https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/",
  "national/asrh/nc-est2023-agesex-res.csv"
 ), col_types = "iiiiiii") |>
  filter(SEX == 0, AGE %in% 0:100) |>
  select(AGE, POPESTIMATE2022) |>
  setNames(c("age", "population")) |>
  bind_rows(tibble(age = 101:125, population = NA))

 # Interpolate 100+ population to 100-125
 w <- exp(-0.66 * 0:25)
 pop$w <- c(rep(NA, 100), w / sum(w))
 remainder <- pop$population[pop$age == 100]
 pop$population[pop$age == 100] <- NA
 pop <- pop |>
  mutate(population = ifelse(!is.na(population),
    population, ceiling(w * remainder)
  )) |>
  select(-w)

 ### Combined Dataset
 df <- deaths |>
  right_join(pop, by = join_by(age)) |>
  mutate(across(everything(), as.integer),
    deaths = ifelse(is.na(deaths), 0, deaths)
  ) |>
  arrange(age) |>
  filter(row_number() <= max(which(deaths > 0)) | deaths > 0)


 ### 1) Single ages, 0-125
 usa_demog <- demogdata(
  data = matrix(df$deaths / df$population, ncol = 1),
  pop = matrix(df$population, ncol = 1),
  age = matrix(df$age, ncol = 1),
  label = "USA",
  type = "mortality",
  name = "total",
  years = 2022
 )
 usa_life_table <- lifetable(usa_demog)
 result_1 <- usa_life_table$ex[1]


 ### 2) Single ages, 0-85+
 df2 <- df |>
  mutate(age = ifelse(age >= 85, "85+", as.character(age))) |>
  group_by(age) |>
  summarize(
    deaths = sum(deaths),
    population = sum(population), .groups = "drop"
  )

 # Evenly split 85+ to 85-100
 ages <- 85:100
 df2 <- bind_rows(
  df2, tibble(
    age = as.character(ages),
    deaths = rep(df2$deaths[df2$age == "85+"] / length(ages), length(ages)),
    population =
      rep(df2$population[df2$age == "85+"] / length(ages), length(ages))
  )
 ) |>
  filter(age != "85+") |>
  mutate(age = as.integer(age)) |>
  dplyr::arrange(age)

 usa_demog <- demogdata(
  data = matrix(df2$deaths / df2$population, ncol = 1),
  pop = matrix(df2$population, ncol = 1),
  age = matrix(df2$age, ncol = 1),
  label = "USA",
  type = "mortality",
  name = "total",
  years = 2022
 )
 usa_life_table <- lifetable(usa_demog)
 result_2a <- usa_life_table$ex[1]


 usa_demog <- demogdata(
  data = matrix(df2$deaths / df$population[0:101], ncol = 1),
  pop = matrix(df$population[0:101], ncol = 1),
  age = matrix(df$age[0:101], ncol = 1),
  label = "USA",
  type = "mortality",
  name = "total",
  years = 2022
 )
 usa_life_table <- lifetable(usa_demog)
 result_2b <- usa_life_table$ex[1]



 ### 3) 5y age groups, 0-85+
 df3 <- df |>
  mutate(
    age = case_when(
      age < 5 ~ "0-4",
      age < 10 ~ "5-9",
      age < 15 ~ "10-14",
      age < 20 ~ "15-19",
      age < 25 ~ "20-24",
      age < 30 ~ "25-29",
      age < 35 ~ "30-34",
      age < 40 ~ "35-39",
      age < 45 ~ "40-44",
      age < 50 ~ "45-49",
      age < 55 ~ "50-54",
      age < 60 ~ "55-59",
      age < 65 ~ "60-64",
      age < 70 ~ "65-69",
      age < 75 ~ "70-74",
      age < 80 ~ "75-79",
      age < 85 ~ "80-84",
      TRUE ~ "85+"
    )
  ) |>
  group_by(age) |>
  summarize(
    deaths = sum(deaths), population = sum(population), .groups = "drop"
  )

 # Function to disaggregate a single row into individual years
 disaggregate_row <- function(age_range, deaths, population) {
  if (age_range == "85+") {
    years <- 85:100 # Define the single-year range for 85+
  } else {
    years <- as.numeric(
      unlist(strsplit(age_range, "-"))[1]
    ):as.numeric(unlist(strsplit(age_range, "-"))[2])
  }
  tibble(
    age = years,
    deaths = rep(deaths / length(years), length(years)),
    population = rep(population / length(years), length(years))
  )
 }

 # Apply the disaggregation to all rows
 df3 <- df3 %>%
  rowwise() %>%
  do(disaggregate_row(.$age, .$deaths, .$population)) %>%
  ungroup()

 usa_demog <- demogdata(
  data = matrix(df3$deaths / df3$population, ncol = 1),
  pop = matrix(df3$population, ncol = 1),
  age = matrix(df3$age, ncol = 1),
  label = "USA",
  type = "mortality",
  name = "total",
  years = 2022
 )
 usa_life_table <- lifetable(usa_demog)
 result_3a <- usa_life_table$ex[1]


 usa_demog <- demogdata(
  data = matrix(df3$deaths / df$population[0:101], ncol = 1),
  pop = matrix(df$population[0:101], ncol = 1),
  age = matrix(df$age[0:101], ncol = 1),
  label = "USA",
  type = "mortality",
  name = "total",
  years = 2022
 )
 usa_life_table <- lifetable(usa_demog)
 result_3b <- usa_life_table$ex[1]



 # Plot
 chart <- bind_rows(
  df |> mutate(source = "Actuals"),
  df2 |> mutate(source = "0-85+, single years"),
  df3 |> mutate(source = "0-85+, 5y age groups")
 ) |>
  pivot_longer(cols = -c(age, source)) |>
  ggplot(aes(x = age, y = value, color = source)) +
  geom_line() +
  scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    title = "Deaths & Populations by single age groups [USA, Year=2022]",
    x = "Age",
    y = "",
    color = "Source"
  ) +
  theme(legend.position = "top") +
  facet_wrap(vars(name), scales = "free")

 ggsave(
  filename = "chart1.png",
  plot = chart,
  width = width,
  height = height,
  units = "px",
  dpi = 72 * sf,
  device = grDevices::png,
  type = "cairo"
 )

 # Compare results:
 cat(
  sprintf("1) 0-125, single year: %.2f\n", result_1),
  sprintf(
    "2a) 0-85+, single year: %.2f (diff: %.2f%%)\n",
    result_2a, ((result_2a - result_1) / result_1) * 100
  ),
  sprintf(
    "2b) 0-85+, single year (pop. 0-121): %.2f (diff: %.2f%%)\n",
    result_2b, ((result_2b - result_1) / result_1) * 100
  ),
  sprintf(
    "3a) 0-85+, 5y age groups: %.2f (diff: %.2f%%)\n",
    result_3a, ((result_3a - result_1) / result_1) * 100
  ),
  sprintf(
    "3b) 0-85+, 5y age groups (populations 0-121): %.2f (diff: %.2f%%)\n",
    result_3b, ((result_3b - result_1) / result_1) * 100
  )
 )
diff --git a/sample_le_birth.r b/sample_le_birth.r
 library(tidyverse)

 sf <- 2
 width <- 600 * sf
 height <- 335 * sf

 options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))


 ### Deaths
 deaths <- read_csv(paste0(
  "https://gist.githubusercontent.com/USMortality/",
  "2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/",
  "data_usa_deaths_single_years.csv"
 ), col_types = "cciiiciiciiiccci") |>
  select(Age, Deaths) |>
  setNames(c("age", "deaths")) |>
  filter(!is.na(age)) |>
  group_by(age) |>
  summarize(deaths = sum(deaths))


 ### Population
 pop <- read_csv(paste0(
  "https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/",
  "national/asrh/nc-est2023-agesex-res.csv"
 ), col_types = "iiiiiii") |>
  filter(SEX == 0, AGE %in% 0:100) |>
  select(AGE, POPESTIMATE2022) |>
  setNames(c("age", "population")) |>
  bind_rows(tibble(age = 101:125, population = NA))

 # Interpolate 100+ population to 100-125
 w <- exp(-0.66 * 0:25)
 pop$w <- c(rep(NA, 100), w / sum(w))
 remainder <- pop$population[pop$age == 100]
 pop$population[pop$age == 100] <- NA
 pop <- pop |>
  mutate(population = ifelse(!is.na(population),
    population, ceiling(w * remainder)
  )) |>
  select(-w)

 ### Combined Dataset
 df <- deaths |>
  right_join(pop, by = join_by(age)) |>
  mutate(across(everything(), as.integer),
    deaths = ifelse(is.na(deaths), 0, deaths),
    cmr = deaths / population * 100000
  ) |>
  arrange(age) |>
  filter(row_number() <= max(which(deaths > 0)) | deaths > 0)

 # Plot
 quartz(width = 6, height = 3)
 df |>
  filter(age > 70) |>
  pivot_longer(!1) |>
  ggplot(aes(x = age, y = value)) +
  geom_line() +
  scale_y_log10() +
  labs(title = "Crude Mortality by Single Years of Age [USA, Year=2022]") +
  facet_wrap(vars(name))


 ### Calculate LE
 # Method: Hyndman et al.
 library(demography)

 usa_demog <- demogdata(
  data = matrix(df$deaths / df$population, ncol = 1),
  pop = matrix(df$population, ncol = 1),
  age = matrix(df$age, ncol = 1),
  label = "USA",
  type = "mortality",
  name = "total",
  years = 2022
 )

 usa_life_table <- lifetable(usa_demog)
 # LE at Birth:
 result_1 <- usa_life_table$ex[1]


 # Method: Ulf Lorré
 ages <- df$age + 0.5
 mx <- df$deaths / df$population
 qx <- mx / (1 + mx / 2)
 p <- rep(1, length(ages))
 d <- rep(0, length(ages))

 for (a in 1:(length(ages) - 1)) {
  d[a] <- p[a] * qx[a]
  p[a + 1] <- max(0, p[a] - d[a])
 }

 d[length(ages)] <- p[length(ages)] * qx[length(ages)]
 result_2 <- weighted.mean(ages, d)


 # Method Life Table
 life_table <- df |>
  mutate(
    # Mortality hazard rate (average rate of dying during the interval)
    mx = deaths / population,
    # Crude probability of dying (approximation, assumes low mortality rates)
    # qx = mx,
    # Adjusted probability of dying (accounts for timing of deaths during the interval)
    qx = mx / (1 + mx / 2),
    px = 1 - qx, # Survival probability
    # Survivors at start of age interval (assume l0 = 100,000)
    lx = lag(cumprod(px), default = 1) * 100000,
    dx = lx * qx, # Number of deaths in the interval
    Lx = lx - 0.5 * dx, # Person-years lived in the interval
    Tx = rev(cumsum(rev(Lx))), # Total person-years lived beyond age x
    ex = Tx / lx # Life expectancy at age x
  )

 result_3 <- life_table[1, ]$ex

 ### Henjin
 age <- df$age
 mx <- df$cmr / 100000

 # Probability of dying (adjusted for exposure time)
 qx <- mx / (1 + mx / 2)
 qx <- pmin(qx, 1) # Ensure qx does not exceed 1
 px <- 1 - qx # Probability of surviving

 # Survivors at start of each age group
 lx <- numeric(length(age))
 lx[1] <- 1e5 # Starting population (arbitrary base of 100,000)
 for (i in 2:length(age)) {
  lx[i] <- lx[i - 1] * px[i - 1]
 }

 # Number of deaths in each age group
 dx <- lx * qx

 # Person-years lived in each age group
 Lx <- lx - 0.5 * dx
 # Adjust last age group's person-years lived
 if (mx[length(mx)] > 0) {
  Lx[length(Lx)] <- lx[length(Lx)] / mx[length(mx)]
 } else {
  Lx[length(Lx)] <- lx[length(Lx)] # If mx is zero, assume no deaths
 }

 # Cumulative person-years remaining beyond each age
 Tx <- numeric(length(age))
 Tx[length(Tx)] <- Lx[length(Lx)]
 for (i in (length(age) - 1):1) {
  Tx[i] <- Tx[i + 1] + Lx[i]
 }

 # Life expectancy at each age
 ex <- Tx / lx

 # Return the life table as a data frame
 life_table <- data.frame(age, mx, qx, px, lx, dx, Lx, Tx, ex)

 result_4 <- life_table[1, ]$ex


 cat(
  sprintf("Hyndman: %f\n", result_1),
  sprintf("Ulf Lorré: %f\n", result_2),
  sprintf("ChatGPT (Ben): %f\n", result_3),
  sprintf("ChatGPT (Henjin): %f\n", result_4)
 )
diff --git a/sample_spline_interpolation.r b/sample_spline_interpolation.r
 library(splines)

 library(tidyverse)
 library(demography)
 library(scales)

 sf <- 2
 width <- 600 * sf
 height <- 335 * sf

 options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))


 ### Deaths
 deaths <- read_csv(paste0(
  "https://gist.githubusercontent.com/USMortality/",
  "2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/",
  "data_usa_deaths_single_years.csv"
 ), col_types = "cciiiciiciiiccci") |>
  select(Age, Deaths) |>
  setNames(c("age", "deaths")) |>
  filter(!is.na(age)) |>
  group_by(age) |>
  summarize(deaths = sum(deaths))


 ### Population
 pop <- read_csv(paste0(
  "https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/",
  "national/asrh/nc-est2023-agesex-res.csv"
 ), col_types = "iiiiiii") |>
  filter(SEX == 0, AGE %in% 0:100) |>
  select(AGE, POPESTIMATE2022) |>
  setNames(c("age", "population")) |>
  bind_rows(tibble(age = 101:125, population = NA))

 # Interpolate 100+ population to 100-125
 w <- exp(-0.66 * 0:25)
 pop$w <- c(rep(NA, 100), w / sum(w))
 remainder <- pop$population[pop$age == 100]
 pop$population[pop$age == 100] <- NA
 pop <- pop |>
  mutate(population = ifelse(!is.na(population),
    population, ceiling(w * remainder)
  )) |>
  select(-w)

 ### Combined Dataset
 df <- deaths |>
  right_join(pop, by = join_by(age)) |>
  mutate(across(everything(), as.integer),
    deaths = ifelse(is.na(deaths), 0, deaths)
  ) |>
  arrange(age) |>
  filter(row_number() <= max(which(deaths > 0)) | deaths > 0) |>
  mutate(cmr = deaths / population * 100000)

 ### Functions

 # Aggregate single age data into age groups
 aggregate_age_groups <- function(df, age_groups) {
  df |>
    mutate(age = case_when(!!!age_groups)) |>
    group_by(age) |>
    summarize(
      deaths = sum(deaths, na.rm = TRUE),
      population = sum(population, na.rm = TRUE),
      .groups = "drop"
    ) |>
    mutate(cmr = deaths / population * 100000) |>
    select(age, cmr)
 }

 # Function to disaggregate a single row into individual years
 disaggregate_row_lin <- function(age_range, cmr) {
  if (grepl("\\+", age_range)) { # Check if age_range includes a plus sign
    start_age <- as.numeric(sub("\\+.*", "", age_range))
    years <- start_age:100
  } else {
    years <- as.numeric(
      unlist(strsplit(age_range, "-"))[1]
    ):as.numeric(unlist(strsplit(age_range, "-"))[2])
  }

  tibble(age = years, cmr = rep(cmr, length(years)))
 }

 disaggregate_lin <- function(df) {
  df |>
    rowwise() |>
    do(disaggregate_row_lin(.$age, .$cmr)) |>
    ungroup()
 }

 get_lifetable <- function(df) {
  df |>
    mutate(
      # Mortality hazard rate (average rate of dying during the interval)
      mx = cmr / 100000,
      # Adjusted probability of dying (accounts for timing of deaths during the interval)
      qx = mx / (1 + mx / 2),
      px = 1 - qx, # Survival probability
      # Survivors at start of age interval (assume l0 = 100,000)
      lx = lag(cumprod(px), default = 1) * 100000,
      dx = lx * qx, # Number of deaths in the interval
      Lx = lx - 0.5 * dx, # Person-years lived in the interval
      Tx = rev(cumsum(rev(Lx))), # Total person-years lived beyond age x
      ex = Tx / lx # Life expectancy at age x
    )
 }

 # Age group definitions
 age_groups_5y <- rlang::exprs(
  age < 5 ~ "00-04",
  age < 10 ~ "05-09",
  age < 15 ~ "10-14",
  age < 20 ~ "15-19",
  age < 25 ~ "20-24",
  age < 30 ~ "25-29",
  age < 35 ~ "30-34",
  age < 40 ~ "35-39",
  age < 45 ~ "40-44",
  age < 50 ~ "45-49",
  age < 55 ~ "50-54",
  age < 60 ~ "55-59",
  age < 65 ~ "60-64",
  age < 70 ~ "65-69",
  age < 75 ~ "70-74",
  age < 80 ~ "75-79",
  age < 85 ~ "80-84",
  TRUE ~ "85+"
 )

 age_groups_10y <- rlang::exprs(
  age < 10 ~ "00-09",
  age < 20 ~ "10-19",
  age < 30 ~ "20-29",
  age < 40 ~ "30-39",
  age < 50 ~ "40-49",
  age < 60 ~ "50-59",
  age < 70 ~ "60-69",
  age < 80 ~ "70-79",
  TRUE ~ "80+"
 )

 age_groups_20y <- rlang::exprs(
  age < 20 ~ "00-19",
  age < 40 ~ "20-39",
  age < 60 ~ "40-59",
  age < 80 ~ "60-79",
  TRUE ~ "80+"
 )

 age_groups_mortality_org <- rlang::exprs(
  age < 15 ~ "00-14",
  age < 65 ~ "15-64",
  age < 75 ~ "65-74",
  age < 85 ~ "75-84",
  TRUE ~ "85+"
 )

 # Perform aggregations
 df5y <- aggregate_age_groups(df, age_groups_5y)
 # df10y <- aggregate_age_groups(df, age_groups_10y)
 # df20y <- aggregate_age_groups(df, age_groups_20y)
 # df_mortality_org <- aggregate_age_groups(df, age_groups_mortality_org)

 # Use the midpoints of the age groups
 data <- df5y %>%
  mutate(midpoint = case_when(
    age == "85+" ~ 90, # Manually assign the midpoint for "80+"
    TRUE ~ rowMeans(strsplit(age, "-") %>% lapply(as.numeric) %>% do.call(rbind, .))
  ))

 # Fit a spline model
 spline_model <- smooth.spline(data$midpoint, data$cmr)

 # Predict for single ages from 0 to 100
 single_ages <- seq(0, 100, by = 1)
 predicted_cmr <- predict(spline_model, single_ages, deriv = 0)$y

 # Ensure predictions are non-negative (if needed)
 predicted_cmr[predicted_cmr < 0] <- 0

 dfx <- tibble(age = single_ages, cmr = predicted_cmr)


 bind_rows(
  df |> mutate(source = "Actuals") |> select(age, cmr, source),
  df5y |> disaggregate_lin() |> mutate(source = "0-85+, 5y"),
  dfx |> mutate(source = "Interpolated (Spline)")
 ) |>
  pivot_longer(cols = -c(age, source)) |>
  ggplot(aes(x = age, y = value, color = source)) +
  geom_line() +
  scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    title = "Crude Mortality by single year age groups [USA, Year=2022]",
    x = "Age",
    y = "Crude Mortality Rate per 100k (CMR)",
    color = "Source"
  ) +
  theme(legend.position = "top") +
  facet_wrap(vars(name), scales = "free")

 a <- df |> get_lifetable()
 b <- dfx |> get_lifetable()

 cat(
  sprintf("1) 0-125, single year: %.2f\n", a[1, ]$ex),
  sprintf(
    "2) Spline Interpolated; 0-85+, 5y: %.2f (diff: %.2f%%)\n",
    b[1, ]$ex, ((b[1, ]$ex - a[1, ]$ex) / a[1, ]$ex) * 100
  )
 )
	library(tidyverse)
	library(demography)
	library(scales)

	sf <- 2
	width <- 600 * sf
	height <- 335 * sf

	options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))


	### Deaths
	deaths <- read_csv(paste0(
	"https://gist.githubusercontent.com/USMortality/",
	"2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/",
	"data_usa_deaths_single_years.csv"
	), col_types = "cciiiciiciiiccci") \|>
	select(Age, Deaths) \|>
	setNames(c("age", "deaths")) \|>
	filter(!is.na(age)) \|>
	group_by(age) \|>
	summarize(deaths = sum(deaths))


	### Population
	pop <- read_csv(paste0(
	"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/",
	"national/asrh/nc-est2023-agesex-res.csv"
	), col_types = "iiiiiii") \|>
	filter(SEX == 0, AGE %in% 0:100) \|>
	select(AGE, POPESTIMATE2022) \|>
	setNames(c("age", "population")) \|>
	bind_rows(tibble(age = 101:125, population = NA))

	# Interpolate 100+ population to 100-125
	w <- exp(-0.66 * 0:25)
	pop$w <- c(rep(NA, 100), w / sum(w))
	remainder <- pop$population[pop$age == 100]
	pop$population[pop$age == 100] <- NA
	pop <- pop \|>
	mutate(population = ifelse(!is.na(population),
	population, ceiling(w * remainder)
	)) \|>
	select(-w)

	### Combined Dataset
	df <- deaths \|>
	right_join(pop, by = join_by(age)) \|>
	mutate(across(everything(), as.integer),
	deaths = ifelse(is.na(deaths), 0, deaths)
	) \|>
	arrange(age) \|>
	filter(row_number() <= max(which(deaths > 0)) \| deaths > 0)


	### 1) Single ages, 0-125
	usa_demog <- demogdata(
	data = matrix(df$deaths / df$population, ncol = 1),
	pop = matrix(df$population, ncol = 1),
	age = matrix(df$age, ncol = 1),
	label = "USA",
	type = "mortality",
	name = "total",
	years = 2022
	)
	usa_life_table <- lifetable(usa_demog)
	result_1 <- usa_life_table$ex[1]


	### 2) Single ages, 0-85+
	df2 <- df \|>
	mutate(age = ifelse(age >= 85, "85+", as.character(age))) \|>
	group_by(age) \|>
	summarize(
	deaths = sum(deaths),
	population = sum(population), .groups = "drop"
	)

	# Evenly split 85+ to 85-100
	ages <- 85:100
	df2 <- bind_rows(
	df2, tibble(
	age = as.character(ages),
	deaths = rep(df2$deaths[df2$age == "85+"] / length(ages), length(ages)),
	population =
	rep(df2$population[df2$age == "85+"] / length(ages), length(ages))
	)
	) \|>
	filter(age != "85+") \|>
	mutate(age = as.integer(age)) \|>
	dplyr::arrange(age)

	usa_demog <- demogdata(
	data = matrix(df2$deaths / df2$population, ncol = 1),
	pop = matrix(df2$population, ncol = 1),
	age = matrix(df2$age, ncol = 1),
	label = "USA",
	type = "mortality",
	name = "total",
	years = 2022
	)
	usa_life_table <- lifetable(usa_demog)
	result_2a <- usa_life_table$ex[1]


	usa_demog <- demogdata(
	data = matrix(df2$deaths / df$population[0:101], ncol = 1),
	pop = matrix(df$population[0:101], ncol = 1),
	age = matrix(df$age[0:101], ncol = 1),
	label = "USA",
	type = "mortality",
	name = "total",
	years = 2022
	)
	usa_life_table <- lifetable(usa_demog)
	result_2b <- usa_life_table$ex[1]



	### 3) 5y age groups, 0-85+
	df3 <- df \|>
	mutate(
	age = case_when(
	age < 5 ~ "0-4",
	age < 10 ~ "5-9",
	age < 15 ~ "10-14",
	age < 20 ~ "15-19",
	age < 25 ~ "20-24",
	age < 30 ~ "25-29",
	age < 35 ~ "30-34",
	age < 40 ~ "35-39",
	age < 45 ~ "40-44",
	age < 50 ~ "45-49",
	age < 55 ~ "50-54",
	age < 60 ~ "55-59",
	age < 65 ~ "60-64",
	age < 70 ~ "65-69",
	age < 75 ~ "70-74",
	age < 80 ~ "75-79",
	age < 85 ~ "80-84",
	TRUE ~ "85+"
	)
	) \|>
	group_by(age) \|>
	summarize(
	deaths = sum(deaths), population = sum(population), .groups = "drop"
	)

	# Function to disaggregate a single row into individual years
	disaggregate_row <- function(age_range, deaths, population) {
	if (age_range == "85+") {
	years <- 85:100 # Define the single-year range for 85+
	} else {
	years <- as.numeric(
	unlist(strsplit(age_range, "-"))[1]
	):as.numeric(unlist(strsplit(age_range, "-"))[2])
	}
	tibble(
	age = years,
	deaths = rep(deaths / length(years), length(years)),
	population = rep(population / length(years), length(years))
	)
	}

	# Apply the disaggregation to all rows
	df3 <- df3 %>%
	rowwise() %>%
	do(disaggregate_row(.$age, .$deaths, .$population)) %>%
	ungroup()

	usa_demog <- demogdata(
	data = matrix(df3$deaths / df3$population, ncol = 1),
	pop = matrix(df3$population, ncol = 1),
	age = matrix(df3$age, ncol = 1),
	label = "USA",
	type = "mortality",
	name = "total",
	years = 2022
	)
	usa_life_table <- lifetable(usa_demog)
	result_3a <- usa_life_table$ex[1]


	usa_demog <- demogdata(
	data = matrix(df3$deaths / df$population[0:101], ncol = 1),
	pop = matrix(df$population[0:101], ncol = 1),
	age = matrix(df$age[0:101], ncol = 1),
	label = "USA",
	type = "mortality",
	name = "total",
	years = 2022
	)
	usa_life_table <- lifetable(usa_demog)
	result_3b <- usa_life_table$ex[1]



	# Plot
	chart <- bind_rows(
	df \|> mutate(source = "Actuals"),
	df2 \|> mutate(source = "0-85+, single years"),
	df3 \|> mutate(source = "0-85+, 5y age groups")
	) \|>
	pivot_longer(cols = -c(age, source)) \|>
	ggplot(aes(x = age, y = value, color = source)) +
	geom_line() +
	scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) +
	labs(
	title = "Deaths & Populations by single age groups [USA, Year=2022]",
	x = "Age",
	y = "",
	color = "Source"
	) +
	theme(legend.position = "top") +
	facet_wrap(vars(name), scales = "free")

	ggsave(
	filename = "chart1.png",
	plot = chart,
	width = width,
	height = height,
	units = "px",
	dpi = 72 * sf,
	device = grDevices::png,
	type = "cairo"
	)

	# Compare results:
	cat(
	sprintf("1) 0-125, single year: %.2f\n", result_1),
	sprintf(
	"2a) 0-85+, single year: %.2f (diff: %.2f%%)\n",
	result_2a, ((result_2a - result_1) / result_1) * 100
	),
	sprintf(
	"2b) 0-85+, single year (pop. 0-121): %.2f (diff: %.2f%%)\n",
	result_2b, ((result_2b - result_1) / result_1) * 100
	),
	sprintf(
	"3a) 0-85+, 5y age groups: %.2f (diff: %.2f%%)\n",
	result_3a, ((result_3a - result_1) / result_1) * 100
	),
	sprintf(
	"3b) 0-85+, 5y age groups (populations 0-121): %.2f (diff: %.2f%%)\n",
	result_3b, ((result_3b - result_1) / result_1) * 100
	)
	)
	library(splines)

	library(tidyverse)
	library(demography)
	library(scales)

	sf <- 2
	width <- 600 * sf
	height <- 335 * sf

	options(vsc.dev.args = list(width = width, height = height, res = 72 * sf))


	### Deaths
	deaths <- read_csv(paste0(
	"https://gist.githubusercontent.com/USMortality/",
	"2b1b8fcc59d7b717d5b7b16e3d408ff7/raw/",
	"data_usa_deaths_single_years.csv"
	), col_types = "cciiiciiciiiccci") \|>
	select(Age, Deaths) \|>
	setNames(c("age", "deaths")) \|>
	filter(!is.na(age)) \|>
	group_by(age) \|>
	summarize(deaths = sum(deaths))


	### Population
	pop <- read_csv(paste0(
	"https://www2.census.gov/programs-surveys/popest/datasets/2020-2023/",
	"national/asrh/nc-est2023-agesex-res.csv"
	), col_types = "iiiiiii") \|>
	filter(SEX == 0, AGE %in% 0:100) \|>
	select(AGE, POPESTIMATE2022) \|>
	setNames(c("age", "population")) \|>
	bind_rows(tibble(age = 101:125, population = NA))

	# Interpolate 100+ population to 100-125
	w <- exp(-0.66 * 0:25)
	pop$w <- c(rep(NA, 100), w / sum(w))
	remainder <- pop$population[pop$age == 100]
	pop$population[pop$age == 100] <- NA
	pop <- pop \|>
	mutate(population = ifelse(!is.na(population),
	population, ceiling(w * remainder)
	)) \|>
	select(-w)

	### Combined Dataset
	df <- deaths \|>
	right_join(pop, by = join_by(age)) \|>
	mutate(across(everything(), as.integer),
	deaths = ifelse(is.na(deaths), 0, deaths)
	) \|>
	arrange(age) \|>
	filter(row_number() <= max(which(deaths > 0)) \| deaths > 0) \|>
	mutate(cmr = deaths / population * 100000)

	### Functions

	# Aggregate single age data into age groups
	aggregate_age_groups <- function(df, age_groups) {
	df \|>
	mutate(age = case_when(!!!age_groups)) \|>
	group_by(age) \|>
	summarize(
	deaths = sum(deaths, na.rm = TRUE),
	population = sum(population, na.rm = TRUE),
	.groups = "drop"
	) \|>
	mutate(cmr = deaths / population * 100000) \|>
	select(age, cmr)
	}

	# Function to disaggregate a single row into individual years
	disaggregate_row_lin <- function(age_range, cmr) {
	if (grepl("\\+", age_range)) { # Check if age_range includes a plus sign
	start_age <- as.numeric(sub("\\+.*", "", age_range))
	years <- start_age:100
	} else {
	years <- as.numeric(
	unlist(strsplit(age_range, "-"))[1]
	):as.numeric(unlist(strsplit(age_range, "-"))[2])
	}

	tibble(age = years, cmr = rep(cmr, length(years)))
	}

	disaggregate_lin <- function(df) {
	df \|>
	rowwise() \|>
	do(disaggregate_row_lin(.$age, .$cmr)) \|>
	ungroup()
	}

	get_lifetable <- function(df) {
	df \|>
	mutate(
	# Mortality hazard rate (average rate of dying during the interval)
	mx = cmr / 100000,
	# Adjusted probability of dying (accounts for timing of deaths during the interval)
	qx = mx / (1 + mx / 2),
	px = 1 - qx, # Survival probability
	# Survivors at start of age interval (assume l0 = 100,000)
	lx = lag(cumprod(px), default = 1) * 100000,
	dx = lx * qx, # Number of deaths in the interval
	Lx = lx - 0.5 * dx, # Person-years lived in the interval
	Tx = rev(cumsum(rev(Lx))), # Total person-years lived beyond age x
	ex = Tx / lx # Life expectancy at age x
	)
	}

	# Age group definitions
	age_groups_5y <- rlang::exprs(
	age < 5 ~ "00-04",
	age < 10 ~ "05-09",
	age < 15 ~ "10-14",
	age < 20 ~ "15-19",
	age < 25 ~ "20-24",
	age < 30 ~ "25-29",
	age < 35 ~ "30-34",
	age < 40 ~ "35-39",
	age < 45 ~ "40-44",
	age < 50 ~ "45-49",
	age < 55 ~ "50-54",
	age < 60 ~ "55-59",
	age < 65 ~ "60-64",
	age < 70 ~ "65-69",
	age < 75 ~ "70-74",
	age < 80 ~ "75-79",
	age < 85 ~ "80-84",
	TRUE ~ "85+"
	)

	age_groups_10y <- rlang::exprs(
	age < 10 ~ "00-09",
	age < 20 ~ "10-19",
	age < 30 ~ "20-29",
	age < 40 ~ "30-39",
	age < 50 ~ "40-49",
	age < 60 ~ "50-59",
	age < 70 ~ "60-69",
	age < 80 ~ "70-79",
	TRUE ~ "80+"
	)

	age_groups_20y <- rlang::exprs(
	age < 20 ~ "00-19",
	age < 40 ~ "20-39",
	age < 60 ~ "40-59",
	age < 80 ~ "60-79",
	TRUE ~ "80+"
	)

	age_groups_mortality_org <- rlang::exprs(
	age < 15 ~ "00-14",
	age < 65 ~ "15-64",
	age < 75 ~ "65-74",
	age < 85 ~ "75-84",
	TRUE ~ "85+"
	)

	# Perform aggregations
	df5y <- aggregate_age_groups(df, age_groups_5y)
	# df10y <- aggregate_age_groups(df, age_groups_10y)
	# df20y <- aggregate_age_groups(df, age_groups_20y)
	# df_mortality_org <- aggregate_age_groups(df, age_groups_mortality_org)

	# Use the midpoints of the age groups
	data <- df5y %>%
	mutate(midpoint = case_when(
	age == "85+" ~ 90, # Manually assign the midpoint for "80+"
	TRUE ~ rowMeans(strsplit(age, "-") %>% lapply(as.numeric) %>% do.call(rbind, .))
	))

	# Fit a spline model
	spline_model <- smooth.spline(data$midpoint, data$cmr)

	# Predict for single ages from 0 to 100
	single_ages <- seq(0, 100, by = 1)
	predicted_cmr <- predict(spline_model, single_ages, deriv = 0)$y

	# Ensure predictions are non-negative (if needed)
	predicted_cmr[predicted_cmr < 0] <- 0

	dfx <- tibble(age = single_ages, cmr = predicted_cmr)


	bind_rows(
	df \|> mutate(source = "Actuals") \|> select(age, cmr, source),
	df5y \|> disaggregate_lin() \|> mutate(source = "0-85+, 5y"),
	dfx \|> mutate(source = "Interpolated (Spline)")
	) \|>
	pivot_longer(cols = -c(age, source)) \|>
	ggplot(aes(x = age, y = value, color = source)) +
	geom_line() +
	scale_y_log10(labels = label_number(scale_cut = cut_short_scale())) +
	labs(
	title = "Crude Mortality by single year age groups [USA, Year=2022]",
	x = "Age",
	y = "Crude Mortality Rate per 100k (CMR)",
	color = "Source"
	) +
	theme(legend.position = "top") +
	facet_wrap(vars(name), scales = "free")

	a <- df \|> get_lifetable()
	b <- dfx \|> get_lifetable()

	cat(
	sprintf("1) 0-125, single year: %.2f\n", a[1, ]$ex),
	sprintf(
	"2) Spline Interpolated; 0-85+, 5y: %.2f (diff: %.2f%%)\n",
	b[1, ]$ex, ((b[1, ]$ex - a[1, ]$ex) / a[1, ]$ex) * 100
	)
	)