Created
September 12, 2020 20:27
-
-
Save jakeybob/61bc2533e096cd28ad63d0743de137cd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(stats19) | |
library(ckanr) | |
# libs for mapping accident lat/long to datazone for Glasgow deprivation analysis | |
library(sf) | |
library(rmapshaper) | |
# below libs useful for speeding up mapping accidents over water to nearest datazone | |
library(foreach) | |
library(doParallel) | |
registerDoParallel(parallel::detectCores()) | |
### LOOKUPS & RECODES #### | |
ckanr_setup(url = "https://www.opendata.nhs.scot/") | |
res_ca_ids <- resource_show(id = "967937c4-8d67-4f39-974f-fd58c4acfda5") | |
ca_ids <- ckan_fetch(x=res_ca_ids$url) | |
res_dz_ids <- resource_show(id = "395476ab-0720-4740-be07-ff4467141352") | |
dz_ids <- ckan_fetch(x=res_dz_ids$url) | |
res_simd <- resource_show(id = "cadf715a-c365-4dcf-a6e0-acd7e3af21ec") | |
simd <- ckan_fetch(x=res_simd$url) | |
res_ca_pop <- resource_show(id = "09ebfefb-33f4-4f6a-8312-2d14e2b02ace") | |
ca_pop <- ckan_fetch(x=res_ca_pop$url) | |
res_dz_pop <- resource_show(id = "c505f490-c201-44bd-abd1-1bd7a64285ee") | |
dz_pop <- ckan_fetch(x=res_dz_pop$url) | |
area_recodes <- c("Glasgow City" = "Glasgow", | |
"City of Edinburgh" = "Edinburgh", | |
"Edinburgh, City of" = "Edinburgh", | |
"Aberdeen City" = "Aberdeen", | |
"Dundee City" = "Dundee", | |
"Newcastle upon Tyne" = "Newcastle", | |
"Bristol, City of" = "Bristol", | |
"Great Britain" = "UK") | |
scot_areas <- c("Scotland", "Glasgow", "Edinburgh", "Aberdeen", "Dundee") | |
gcr_areas <- c("Scotland", "Glasgow", "Inverclyde", "South Lanarkshire", "North Lanarkshire", "East Dunbartonshire", | |
"West Dunbartonshire", "East Renfrewshire", "Renfrewshire") | |
uk_areas <- c("Edinburgh", "Leeds", "Glasgow", "Sheffield", "Newcastle", "Nottingham", "Liverpool", "Birmingham", | |
"Manchester", "Bristol") | |
all_areas <- unique(c(scot_areas, gcr_areas, uk_areas, "Scotland", "UK")) | |
#### GET STATS19 DATA #### | |
years <- 2014:2017 | |
# download all accident and casualty files to temp directory, to be read in and combined in next step | |
# note: some years will download as a collection, e.g. downloading 1997 will download a 1979--2004 combined file | |
# note: only 2014 onward has age of casualty, which we need | |
# note: at time of writing (Sept 2020) 2017 is latest full year of validated data | |
for(year in years){ | |
dl_stats19(year = year, type = "Casualties", ask = FALSE) | |
dl_stats19(year = year, type = "Accidents", ask = FALSE) | |
} | |
# combine data from different years, format, and select incidents from our areas of interest only | |
for(year in years){ | |
if(year == years[1]){df <- tibble()} | |
df_to_append <- read_casualties(year) %>% | |
left_join(read_accidents(year) %>% select(accident_index, local_authority_district, latitude, longitude)) %>% | |
select(age_of_casualty, casualty_type, casualty_severity, local_authority_district, latitude, longitude) %>% | |
mutate(year = year) %>% | |
mutate(area = recode(local_authority_district, !!!area_recodes)) %>% | |
filter(area %in% c(scot_areas, gcr_areas)) %>% | |
mutate(age_group = case_when(age_of_casualty >= 16 ~ "adult", | |
age_of_casualty >= 5 & age_of_casualty <= 15 ~ "child", | |
TRUE ~ "<5")) %>% | |
mutate(casualty_type = case_when(casualty_type == "Pedestrian" ~ "Pedestrian", | |
casualty_type == "Cyclist" ~ "Cyclist", | |
TRUE ~ "Other")) | |
df <- df %>% bind_rows(df_to_append) | |
if(year == years[length(years)]){ | |
write_rds(df, "stats19.rds") | |
rm(df_to_append)} | |
} | |
df <- read_rds("stats19.rds") | |
#### GLASGOW SIMD ANALYSIS #### | |
# To look at deprivation for Glasgow we need to map each accident's lat/long onto a datazone. | |
# datazone shapefile | |
# https://spatialdata.gov.scot/geonetwork/srv/eng/catalog.search;jsessionid=7BFD03DFFDA5CD66CC065C75FBAD2172#/metadata/389787c0-697d-4824-9ca9-9ce8cb79d6f5 | |
scot_map_2011 <- st_read("SG_DataZoneBdry_2011") %>% | |
st_transform(crs="+proj=longlat +datum=WGS84") %>% | |
ms_simplify(., drop_null_geometries = TRUE, keep = 1) %>% | |
select(DataZone, geometry) %>% | |
rename(datazone_2011 = DataZone) | |
# dataframe of just Glasgow's accidents | |
df_gla <- df %>% | |
filter(area == "Glasgow") %>% | |
filter(is.na(latitude) == FALSE, is.na(longitude) == FALSE) | |
# lat/long coords as spatial object points | |
gla_accident_coords <- st_as_sf(df_gla %>% select(latitude, longitude), coords = c("longitude","latitude"), crs = "+proj=longlat +datum=WGS84") | |
# construct vector of datazones which the points intersect. Will be one datazone per point (or none ... see next step) | |
dz_intersects <- st_intersects(gla_accident_coords, scot_map_2011, sparse = TRUE, prepared = TRUE) | |
dz_intersects <-dz_intersects %>% sapply(function(x) if (length(x) == 0) NA_integer_ else x ) # convert to vector with NAs if point not in map | |
# add datazone IDs into Glasgow data | |
df_gla<- df_gla %>% | |
mutate(datazone_2011 = scot_map_2011[dz_intersects,]$datazone_2011) | |
# deal with accidents that don't appear in the datazone geographies; these will occur over water or be outwith Glasgow | |
# Will choose nearest datazone and re-insert these back in, then filter out non-Glasgow datazones | |
df_gla_missing <- df_gla %>% filter(is.na(datazone_2011) == TRUE) | |
df_gla <- df_gla %>% filter(is.na(datazone_2011) == FALSE) # remove these from main data | |
gla_missing_coords <- st_as_sf(df_gla_missing %>% select(latitude, longitude), coords = c("longitude","latitude"), crs = "+proj=longlat +datum=WGS84") | |
gla_missing_datazones <- foreach(i = 1:dim(gla_missing_coords)[1]) %dopar% { | |
distances_to_dzs <- st_distance(gla_missing_coords[i,], scot_map_2011$geometry, by_element = TRUE) | |
scot_map_2011[which(distances_to_dzs == min(distances_to_dzs)),]$datazone_2011 | |
} | |
df_gla_missing$datazone_2011 <- gla_missing_datazones %>% unlist() | |
# add those that had missing datazones back in | |
df_gla <- df_gla %>% | |
bind_rows(df_gla_missing) | |
gla_dz_simd <- dz_ids %>% | |
mutate(area = recode(CAName, !!!area_recodes)) %>% | |
filter(area == "Glasgow") %>% | |
select(DataZone) %>% | |
left_join(simd) %>% | |
select(DataZone, SIMD2016CountryQuintile) %>% | |
rename(datazone_2011 = DataZone, simd = SIMD2016CountryQuintile) | |
df_gla <- df_gla %>% inner_join(gla_dz_simd) # drops those that have non-Glasgow datazones | |
write_rds(df_gla, "stats19_gla.rds") | |
df_gla <- read_rds("stats19_gla.rds") | |
#### PLOTS #### | |
# Populations lookups for calculating rates per 100k pop | |
ca_codes <- ca_ids %>% | |
mutate(area = recode(CAName, !!!area_recodes)) %>% | |
filter(area %in% c(scot_areas, gcr_areas)) %>% | |
filter_at(vars(contains("Archived")), ~is.na(.)) %>% | |
select(CA, area) | |
ca_populations <- ca_pop %>% | |
rename(year = Year) %>% | |
filter(year %in% years) %>% | |
filter(CA %in% ca_codes$CA, | |
Sex == "All") %>% | |
mutate(`age_<5` = select(., Age0:Age4) %>% rowSums(), | |
age_child = select(., Age5:Age15) %>% rowSums(), | |
age_adult = select(., Age16:Age90plus) %>% rowSums()) %>% | |
left_join(ca_codes) %>% | |
rename(age_all = AllAges) %>% | |
select(year, area, contains("age_")) %>% | |
pivot_longer(cols = contains("age_"), values_to = "pop", names_prefix = "age_", names_to = "age_group") | |
gla_simd_populations <- dz_pop %>% | |
rename(datazone_2011 = DataZone, | |
year = Year) %>% | |
filter(year %in% years, | |
Sex == "All") %>% | |
inner_join(gla_dz_simd) %>% | |
mutate(`age_<5` = select(., Age0:Age4) %>% rowSums(), | |
age_child = select(., Age5:Age15) %>% rowSums(), | |
age_adult = select(., Age16:Age90plus) %>% rowSums()) %>% | |
rename(age_all = AllAges) %>% | |
select(year, contains("age_"), simd) %>% | |
pivot_longer(cols = contains("age_"), values_to = "pop", names_prefix = "age_", names_to = "age_group") %>% | |
group_by(year, simd, age_group) %>% | |
summarise(pop = sum(pop)) %>% | |
mutate(area = "Glasgow") | |
# LA level adult / child casualties rate | |
df <- read_rds("stats19.rds") | |
df %>% | |
group_by(area, year, age_group) %>% | |
summarise(casualties = n()) %>% | |
left_join(ca_populations) %>% View() | |
mutate(rate = 1e5 * casualties / pop) %>% | |
filter(age_group %in% c("adult", "child")) %>% | |
ggplot(aes(x = year, y = rate, colour = area)) + | |
geom_line() + geom_point() + | |
facet_wrap(~age_group) + | |
theme_bw() | |
plotly::ggplotly() | |
# Glasgow severity | |
df_gla <- read_rds("stats19_gla.rds") | |
df_gla %>% | |
group_by(year) %>% | |
summarise(casualties_all = n(), area = first(area)) %>% | |
left_join( | |
df_gla %>% | |
filter(casualty_severity != "Slight") %>% | |
group_by(year) %>% | |
summarise(casualties_KSI = n()), area = first(area)) %>% | |
pivot_longer(contains("casualties_"), values_to = "casualties", names_prefix = "casualties_", names_to = "type") %>% | |
left_join(ca_populations %>% filter(age_group == "all")) %>% | |
mutate(rate = 1e5 * casualties / pop) %>% | |
ggplot(aes(x = year, y = rate, colour = type)) + | |
geom_line() + geom_point() + | |
theme_bw() | |
plotly::ggplotly() | |
# Glasgow by mode | |
df_gla <- read_rds("stats19_gla.rds") | |
df_gla %>% | |
filter(age_group %in% c("adult", "child")) %>% | |
group_by(year, casualty_type, age_group) %>% | |
summarise(casualties = n(), area = first(area)) %>% | |
left_join(ca_populations %>% filter(age_group %in% c("adult", "child"))) %>% | |
mutate(rate = 1e5 * casualties / pop) %>% | |
ggplot(aes(x = year, y = rate, colour = casualty_type)) + | |
geom_line() + geom_point() + | |
facet_wrap(~age_group) + | |
theme_bw() | |
plotly::ggplotly() | |
# Glasgow by deprivation | |
df_gla %>% | |
filter(age_group %in% c("adult", "child")) %>% | |
group_by(year, simd, age_group) %>% | |
summarise(casualties = n()) %>% | |
left_join(gla_simd_populations) %>% | |
mutate(rate = 1e5 * casualties / pop) %>% | |
mutate(simd = as.factor(simd)) %>% | |
ggplot(aes(x = year, y = rate, colour = simd)) + | |
geom_line() + geom_point() + | |
facet_wrap(~age_group) + | |
theme_bw() | |
plotly::ggplotly() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment