jakeybob · June 5, 2021 19:54
diff --git a/generate_scot_pop_daily.R b/generate_scot_pop_daily.R
 library(tidyverse)
 library(janitor)
 library(lubridate)
 library(zoo)
 library(ckanr)

 # generates age/sex population lookup for Scotland based on NRS mid-year
 # population estimates/projections, interpolated linearly to day level 
 # (July 2nd used as mid-year ref date)

 # NOTE: oldest age group will be composite e.g. "90+"

 ckanr_setup(url = "https://www.opendata.nhs.scot/")
 res_est <- resource_show(id = "27a72cc8-d6d8-430c-8b4f-3109a9ceadb1") # pop estimates
 res_proj <- resource_show(id = "0876fc67-05e6-4e87-bc30-c4b0756fff04") # pop projections

 # national population estimates (retrospective)
 data_est <- ckan_fetch(x=res_est$url) %>% 
  clean_names() %>% 
  filter(hb == "S92000003",
         sex != "All") %>%
  select(year, sex, contains("age"), -contains("ages"))

 # national population projections (for years with no estimates available)
 data_proj <- ckan_fetch(x=res_proj$url) %>% 
  clean_names() %>% 
  filter(hb == "S92000003",
         sex != "All") %>%
  select(year, sex, contains("age"), -contains("ages")) %>% 
  filter(year %in% unique(data_est$year) == FALSE)

 # combine and format estimate and projections
 data <- data_est %>% 
  bind_rows(data_proj) %>% 
  distinct() %>% 
  mutate(sex = tolower(sex)) %>% 
  arrange(year, sex) %>% 
  pivot_longer(starts_with("age"), names_to = "age", values_to = "pop") %>%
  mutate(age_desc = str_remove(age, "age")) %>% 
  mutate(age = as.integer(str_extract(age, "\\d+"))) %>% 
  mutate(date = dmy(paste0("02/07/", year))) %>% 
  select(-year)

 # create df of all date/sex/age combinations, join on the population data,
 # then linearly interpolate between all the mid-year points
 df <- crossing(date = seq.Date(from = dmy(paste0("02/07/", min(data_est$year, data_proj$year))),
                               to = dmy(paste0("02/07/", max(data_est$year, data_proj$year))),
                               by = "day"),
               sex = unique(data$sex),
               age = unique(data$age)) %>%
  left_join(data) %>% 
  group_by(sex, age) %>% 
  mutate(pop = na.approx(pop)) %>% 
  ungroup() %>% 
  select(date, sex, age, age_desc, pop)

 # output data
 write_rds(df %>% filter(date >= dmy("01/01/2015"), date <= dmy("01/01/2023")), 
          "scot_pop_daily.rds", compress = "gz")

 # quick plot test
 df %>% 
  filter(date >= dmy("01/01/2015"), date <= today()) %>%
  filter(age %in% c(10:25)) %>%
  # filter(age > 80) %>% 
  ggplot(aes(x = date, y = pop, colour = sex)) +
  geom_line() +
  facet_wrap(~age)
	library(tidyverse)
	library(janitor)
	library(lubridate)
	library(zoo)
	library(ckanr)

	# generates age/sex population lookup for Scotland based on NRS mid-year
	# population estimates/projections, interpolated linearly to day level
	# (July 2nd used as mid-year ref date)

	# NOTE: oldest age group will be composite e.g. "90+"

	ckanr_setup(url = "https://www.opendata.nhs.scot/")
	res_est <- resource_show(id = "27a72cc8-d6d8-430c-8b4f-3109a9ceadb1") # pop estimates
	res_proj <- resource_show(id = "0876fc67-05e6-4e87-bc30-c4b0756fff04") # pop projections

	# national population estimates (retrospective)
	data_est <- ckan_fetch(x=res_est$url) %>%
	clean_names() %>%
	filter(hb == "S92000003",
	sex != "All") %>%
	select(year, sex, contains("age"), -contains("ages"))

	# national population projections (for years with no estimates available)
	data_proj <- ckan_fetch(x=res_proj$url) %>%
	clean_names() %>%
	filter(hb == "S92000003",
	sex != "All") %>%
	select(year, sex, contains("age"), -contains("ages")) %>%
	filter(year %in% unique(data_est$year) == FALSE)

	# combine and format estimate and projections
	data <- data_est %>%
	bind_rows(data_proj) %>%
	distinct() %>%
	mutate(sex = tolower(sex)) %>%
	arrange(year, sex) %>%
	pivot_longer(starts_with("age"), names_to = "age", values_to = "pop") %>%
	mutate(age_desc = str_remove(age, "age")) %>%
	mutate(age = as.integer(str_extract(age, "\\d+"))) %>%
	mutate(date = dmy(paste0("02/07/", year))) %>%
	select(-year)

	# create df of all date/sex/age combinations, join on the population data,
	# then linearly interpolate between all the mid-year points
	df <- crossing(date = seq.Date(from = dmy(paste0("02/07/", min(data_est$year, data_proj$year))),
	to = dmy(paste0("02/07/", max(data_est$year, data_proj$year))),
	by = "day"),
	sex = unique(data$sex),
	age = unique(data$age)) %>%
	left_join(data) %>%
	group_by(sex, age) %>%
	mutate(pop = na.approx(pop)) %>%
	ungroup() %>%
	select(date, sex, age, age_desc, pop)

	# output data
	write_rds(df %>% filter(date >= dmy("01/01/2015"), date <= dmy("01/01/2023")),
	"scot_pop_daily.rds", compress = "gz")

	# quick plot test
	df %>%
	filter(date >= dmy("01/01/2015"), date <= today()) %>%
	filter(age %in% c(10:25)) %>%
	# filter(age > 80) %>%
	ggplot(aes(x = date, y = pop, colour = sex)) +
	geom_line() +
	facet_wrap(~age)