thoughtfulbloke · March 9, 2022 08:34
diff --git a/cumulative_age_DHB.R b/cumulative_age_DHB.R
 library(rvest)
 library(readr)
 library(dplyr)
 library(tidyr)
 library(lubridate)

 #######
 # Population denominators: estimated population per DHB and ten-year age
 # band, taken from the 2021 rows where Sex is "Total".
 #
 # Assumes the DHB subnational population file has been downloaded from
 # https://figure.nz/table/vEnTmdKKixC0HrEF
 # and cached in the working directory as
 # Population_Estimated_population_by_sex_age_group_and_DHB_at_June_19962021.csv
 ######

 dhbs_2021 <- read_csv(
   "Population_Estimated_population_by_sex_age_group_and_DHB_at_June_19962021.csv"
 ) %>%
   filter(`Year as at 30 June` == 2021, Sex == "Total") %>%
   mutate(
     # Collapse the file's five-year age groups into ten-year bands.
     # Any age group not listed here gets Age = NA and is dropped below.
     # NOTE(review): the open-ended band is matched literally as "90-*";
     # confirm that is the exact label used in the CSV.
     Age = case_when(
       `Age group` %in% c("0-4", "5-9") ~ "0 to 9",
       `Age group` %in% c("10-14", "15-19") ~ "10 to 19",
       `Age group` %in% c("20-24", "25-29") ~ "20 to 29",
       `Age group` %in% c("30-34", "35-39") ~ "30 to 39",
       `Age group` %in% c("40-44", "45-49") ~ "40 to 49",
       `Age group` %in% c("50-54", "55-59") ~ "50 to 59",
       `Age group` %in% c("60-64", "65-69") ~ "60 to 69",
       `Age group` %in% c("70-74", "75-79") ~ "70 to 79",
       `Age group` %in% c("80-84", "85-89") ~ "80 to 89",
       `Age group` == "90-*" ~ "90+"
     ),
     # Respell four DHB names; every other name passes through unchanged.
     # Presumably this aligns them with the spellings in the case data
     # that is joined on DHB later - verify against that file.
     DHB = case_when(
       `District health board` == "Capital & Coast" ~ "Capital and Coast",
       `District health board` == "Hutt" ~ "Hutt Valley",
       `District health board` == "Tairāwhiti" ~ "Tairawhiti",
       `District health board` == "Waitematā" ~ "Waitemata",
       TRUE ~ `District health board`
     )
   ) %>%
   # Keep only rows that mapped to a band, and drop the national total row.
   filter(!is.na(Age), DHB != "New Zealand") %>%
   select(DHB, Age, Value) %>%
   # Sum the five-year groups that now share a ten-year band.
   group_by(DHB, Age) %>%
   summarise(population = sum(Value), .groups = "drop")


 # Get the latest "all cases" file.
 #
 # Collects every <a href> on the health.govt.nz case demographics page and
 # keeps those ending in "csv". The hrefs are treated as site-relative, so
 # the site root is prepended to build the download URL.

 lnks <- "https://www.health.govt.nz/covid-19-novel-coronavirus/covid-19-data-and-statistics/covid-19-case-demographics" %>%
   read_html() %>%
   html_nodes("a") %>%
   html_attr("href")
 # unique(): a file linked twice on the page must not be read (and its
 # cases counted) twice.
 csv_lnk <- unique(grep("csv$", lnks, value = TRUE))
 # Fail loudly when nothing matched: paste0() treats a zero-length argument
 # as "", so without this guard the "CSV url" would silently become the bare
 # site root and read_csv() would try to parse the homepage.
 if (length(csv_lnk) == 0) {
   stop(
     "No link ending in 'csv' was found on the case demographics page; ",
     "the page layout or URL may have changed.",
     call. = FALSE
   )
 }
 csv_lnk <- paste0("https://www.health.govt.nz", csv_lnk)

 # Read every column as text except the report date, then count cases per
 # DHB, age group and report date.
 NZ_cases <- read_csv(
   csv_lnk,
   col_types = cols(
     `Report Date` = col_date(format = ""),
     .default = col_character()
   )
 ) %>%
   # Drop managed-isolation and unknown-DHB rows (rows with a missing DHB
   # are dropped too, because filter() discards NA comparisons), and keep
   # only rows whose Historical field is empty.
   filter(
     DHB != "Managed Isolation & Quarantine",
     is.na(Historical),
     DHB != "Unknown"
   ) %>%
   count(DHB, Age = `Age group`, Date = `Report Date`, name = "Cases") %>%
   filter(Date > ymd("2022-01-22")) # you may want to change the filter date
 ## Add zeros where days are missing: every DHB / age group / date
 ## combination seen in NZ_cases gets a row, with Cases = 0 where nothing
 ## was reported. Then attach the population and accumulate, per DHB and
 ## age group, the percentage of that population reported as cases.
 max_NZ_date <- max(NZ_cases$Date) # latest report date in the filtered data
 NZ_zerod <- NZ_cases %>%
   complete(DHB, Age, Date, fill = list(Cases = 0)) %>%
   # Keep Cases as a double column regardless of how the fill was cast.
   mutate(Cases = as.numeric(Cases)) %>%
   # Inner join: groups with no matching population row are dropped.
   inner_join(dhbs_2021, by = c("DHB", "Age")) %>%
   # The running total must be taken in date order within each group.
   arrange(DHB, Age, Date) %>%
   group_by(DHB, Age) %>%
   mutate(percent = cumsum(100 * Cases / population)) %>%
   ungroup()
	library(rvest)
	library(readr)
	library(dplyr)
	library(tidyr)
	library(lubridate)

	#######
	# Population denominators: estimated population per DHB and ten-year age
	# band, taken from the 2021 rows where Sex is "Total".
	#
	# Assumes the DHB subnational population file has been downloaded from
	# https://figure.nz/table/vEnTmdKKixC0HrEF
	# and cached in the working directory as
	# Population_Estimated_population_by_sex_age_group_and_DHB_at_June_19962021.csv
	######

	dhbs_2021 <- read_csv(
	  "Population_Estimated_population_by_sex_age_group_and_DHB_at_June_19962021.csv"
	) %>%
	  filter(`Year as at 30 June` == 2021, Sex == "Total") %>%
	  mutate(
	    # Collapse the file's five-year age groups into ten-year bands.
	    # Any age group not listed here gets Age = NA and is dropped below.
	    # NOTE(review): the open-ended band is matched literally as "90-*";
	    # confirm that is the exact label used in the CSV.
	    Age = case_when(
	      `Age group` %in% c("0-4", "5-9") ~ "0 to 9",
	      `Age group` %in% c("10-14", "15-19") ~ "10 to 19",
	      `Age group` %in% c("20-24", "25-29") ~ "20 to 29",
	      `Age group` %in% c("30-34", "35-39") ~ "30 to 39",
	      `Age group` %in% c("40-44", "45-49") ~ "40 to 49",
	      `Age group` %in% c("50-54", "55-59") ~ "50 to 59",
	      `Age group` %in% c("60-64", "65-69") ~ "60 to 69",
	      `Age group` %in% c("70-74", "75-79") ~ "70 to 79",
	      `Age group` %in% c("80-84", "85-89") ~ "80 to 89",
	      `Age group` == "90-*" ~ "90+"
	    ),
	    # Respell four DHB names; every other name passes through unchanged.
	    # Presumably this aligns them with the spellings in the case data
	    # that is joined on DHB later - verify against that file.
	    DHB = case_when(
	      `District health board` == "Capital & Coast" ~ "Capital and Coast",
	      `District health board` == "Hutt" ~ "Hutt Valley",
	      `District health board` == "Tairāwhiti" ~ "Tairawhiti",
	      `District health board` == "Waitematā" ~ "Waitemata",
	      TRUE ~ `District health board`
	    )
	  ) %>%
	  # Keep only rows that mapped to a band, and drop the national total row.
	  filter(!is.na(Age), DHB != "New Zealand") %>%
	  select(DHB, Age, Value) %>%
	  # Sum the five-year groups that now share a ten-year band.
	  group_by(DHB, Age) %>%
	  summarise(population = sum(Value), .groups = "drop")


	# Get the latest "all cases" file.
	#
	# Collects every <a href> on the health.govt.nz case demographics page and
	# keeps those ending in "csv". The hrefs are treated as site-relative, so
	# the site root is prepended to build the download URL.

	lnks <- "https://www.health.govt.nz/covid-19-novel-coronavirus/covid-19-data-and-statistics/covid-19-case-demographics" %>%
	  read_html() %>%
	  html_nodes("a") %>%
	  html_attr("href")
	# unique(): a file linked twice on the page must not be read (and its
	# cases counted) twice.
	csv_lnk <- unique(grep("csv$", lnks, value = TRUE))
	# Fail loudly when nothing matched: paste0() treats a zero-length argument
	# as "", so without this guard the "CSV url" would silently become the bare
	# site root and read_csv() would try to parse the homepage.
	if (length(csv_lnk) == 0) {
	  stop(
	    "No link ending in 'csv' was found on the case demographics page; ",
	    "the page layout or URL may have changed.",
	    call. = FALSE
	  )
	}
	csv_lnk <- paste0("https://www.health.govt.nz", csv_lnk)

	# Read every column as text except the report date, then count cases per
	# DHB, age group and report date.
	NZ_cases <- read_csv(
	  csv_lnk,
	  col_types = cols(
	    `Report Date` = col_date(format = ""),
	    .default = col_character()
	  )
	) %>%
	  # Drop managed-isolation and unknown-DHB rows (rows with a missing DHB
	  # are dropped too, because filter() discards NA comparisons), and keep
	  # only rows whose Historical field is empty.
	  filter(
	    DHB != "Managed Isolation & Quarantine",
	    is.na(Historical),
	    DHB != "Unknown"
	  ) %>%
	  count(DHB, Age = `Age group`, Date = `Report Date`, name = "Cases") %>%
	  filter(Date > ymd("2022-01-22")) # you may want to change the filter date
	## Add zeros where days are missing: every DHB / age group / date
	## combination seen in NZ_cases gets a row, with Cases = 0 where nothing
	## was reported. Then attach the population and accumulate, per DHB and
	## age group, the percentage of that population reported as cases.
	max_NZ_date <- max(NZ_cases$Date) # latest report date in the filtered data
	NZ_zerod <- NZ_cases %>%
	  complete(DHB, Age, Date, fill = list(Cases = 0)) %>%
	  # Keep Cases as a double column regardless of how the fill was cast.
	  mutate(Cases = as.numeric(Cases)) %>%
	  # Inner join: groups with no matching population row are dropped.
	  inner_join(dhbs_2021, by = c("DHB", "Age")) %>%
	  # The running total must be taken in date order within each group.
	  arrange(DHB, Age, Date) %>%
	  group_by(DHB, Age) %>%
	  mutate(percent = cumsum(100 * Cases / population)) %>%
	  ungroup()