Last active
August 7, 2021 07:36
-
-
Save northernjamie/3134fccbe6d352386c13558c2801c995 to your computer and use it in GitHub Desktop.
Thie is an R script that gets real time bus data from DfT, and makes a series of maps and saves as images for making a gif.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(leaflet) | |
library(httr) | |
library(sp) | |
library(rgdal) | |
library(mapview) | |
library(xml2) | |
library(RColorBrewer) | |
####### | |
# This function calls the real time bus api a set number of times, with a set interval, | |
# and generates a series of images that can be stitched together to make a gif | |
# the gif-making is done in python | |
####### | |
####### | |
# TODO: Get the server up and running for a 24 hour job | |
# TODO: Differentiate between inbound/outbound on the map | |
# TODO: Put the API key into a secret file so I can get this into a public repo on GH | |
# TODO: Parameterise the duration and interval | |
####### | |
getBusData <- function() { | |
api_key <- "put your api key here" # API key from DfT | |
bounding_box <- "-2.74,53.3,-1.9,53.7" # Bounding box for GM | |
gm <- readOGR("inputs/gm.geojson") # Geojson for GM for further points clipping | |
gtfs_operators <- read_csv("inputs/itm_all_gtfs/agency.txt") # get the bus operators from DfT | |
i <- 1 | |
for (i in 1:60) { | |
startTime <- Sys.time() # so that we can make the calls exactly 60 seconds apart (or whatever time interval we want) | |
doc <- read_xml(paste0("https://data.bus-data.dft.gov.uk/api/v1/datafeed?api_key=621e077f86267d348add17c93a0e31f839dc013a&boundingBox=",bounding_box)) | |
# here, we are using the xml2 package to get specific nodes from the xml file | |
# the problem is that not all buses have all information. Most suggestions for parsing xml in R | |
# suggest generating lists, and binding them together, but where nodes don't exist for a particular journey | |
# record counts are not equal. This approach generates a df for each node that we want, consisting of | |
# the node and the unique id for the bus journey. If a node doesn't exist, then 'NA' is inserted. | |
# dplyr is used to join them all up at the end. | |
df_line_ref <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
line_refs <- xml_find_all(x, ".//d1:LineRef") | |
if (length(line_refs) == 0) { | |
data.frame(id,line=NA) | |
} else { | |
map_df(line_refs, function(y) { | |
line <- xml_text(y) %||% NA | |
data.frame(id,line) | |
}) | |
} | |
}) | |
df_long <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
longitude <- xml_find_all(x, ".//d1:Longitude") | |
if (length(longitude) == 0) { | |
data.frame(id,long=NA) | |
} else { | |
map_df(longitude, function(y) { | |
long <- xml_text(y) %||% NA | |
data.frame(id,long) | |
}) | |
} | |
}) | |
df_lat <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
latitude <- xml_find_all(x, ".//d1:Latitude") | |
if (length(latitude) == 0) { | |
data.frame(id,lat=NA) | |
} else { | |
map_df(latitude, function(y) { | |
lat <- xml_text(y) %||% NA | |
data.frame(id,lat) | |
}) | |
} | |
}) | |
df_bearing <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
bearings <- xml_find_all(x, ".//d1:Bearing") | |
if (length(bearings) == 0) { | |
data.frame(id,bearing=NA) | |
} else { | |
map_df(bearings, function(y) { | |
bearing <- xml_text(y) %||% NA | |
data.frame(id,bearing) | |
}) | |
} | |
}) | |
df_operator <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
operators <- xml_find_all(x, ".//d1:OperatorRef") | |
if (length(operators) == 0) { | |
data.frame(id,operator=NA) | |
} else { | |
map_df(operators, function(y) { | |
operator <- xml_text(y) %||% NA | |
data.frame(id,operator) | |
}) | |
} | |
}) | |
df_direction <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
directions <- xml_find_all(x, ".//d1:DirectionRef") | |
if (length(directions) == 0) { | |
data.frame(id,direction=NA) | |
} else { | |
map_df(directions, function(y) { | |
direction <- xml_text(y) %||% NA | |
data.frame(id,direction) | |
}) | |
} | |
}) | |
df_destination_ref <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
destination_refs <- xml_find_all(x, ".//d1:DestinationRef") | |
if (length(destination_refs) == 0) { | |
data.frame(id,destination_ref=NA) | |
} else { | |
map_df(destination_refs, function(y) { | |
destination_ref <- xml_text(y) %||% NA | |
data.frame(id,destination_ref) | |
}) | |
} | |
}) | |
df_destination_name <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
destination_names <- xml_find_all(x, ".//d1:DestinationName") | |
if (length(destination_names) == 0) { | |
data.frame(id,destination_name=NA) | |
} else { | |
map_df(destination_names, function(y) { | |
destination_name <- xml_text(y) %||% NA | |
data.frame(id,destination_name) | |
}) | |
} | |
}) | |
df_origin_ref <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
origin_refs <- xml_find_all(x, ".//d1:OriginRef") | |
if (length(origin_refs) == 0) { | |
data.frame(id,origin_ref=NA) | |
} else { | |
map_df(origin_refs, function(y) { | |
origin_ref <- xml_text(y) %||% NA | |
data.frame(id,origin_ref) | |
}) | |
} | |
}) | |
df_origin_name <- xml_find_all(doc, ".//d1:VehicleActivity") %>% | |
map_df(function(x) { | |
id <- xml_text(xml_find_first(x,".//d1:ItemIdentifier")) | |
origin_names <- xml_find_all(x, ".//d1:OriginName") | |
if (length(origin_names) == 0) { | |
data.frame(id,origin_name=NA) | |
} else { | |
map_df(origin_names, function(y) { | |
origin_name <- xml_text(y) %||% NA | |
data.frame(id,origin_name) | |
}) | |
} | |
}) | |
df_bus_info <- df_line_ref %>% | |
left_join(df_lat, by = "id") %>% | |
left_join(df_long, by = "id") %>% | |
left_join(df_bearing, by = "id") %>% | |
left_join(df_operator, by = "id") %>% | |
left_join(df_direction, by = "id") %>% | |
left_join(df_destination_ref, by = "id") %>% | |
left_join(df_destination_name, by = "id") %>% | |
left_join(df_origin_ref, by = "id") %>% | |
left_join(df_origin_name, by = "id") %>% | |
left_join(gtfs_operators, by=c("operator" = "agency_noc")) | |
df_bus_info$long <- as.numeric(df_bus_info$long) | |
df_bus_info$lat <- as.numeric(df_bus_info$lat) | |
df_bus_info$bearing <- as.numeric(df_bus_info$bearing) | |
# turn the data into a spatial points dataframe so that we can clip the dataset to greater manchester | |
coordinates(df_bus_info) <- c("long","lat") | |
as(df_bus_info,"SpatialPoints") | |
proj4string(df_bus_info) <- CRS("+proj=longlat +datum=WGS84") | |
proj4string(df_bus_info) <- proj4string(gm) | |
pointsinpoly <- over(df_bus_info,gm) | |
output <- (cbind(df_bus_info@data,df_bus_info@coords,pointsinpoly$NAME)) %>% | |
filter(!is.na(pointsinpoly$NAME)) | |
# this bit only runs the first iteration of the loop and is used to define the top 12 operators on the first pass | |
# this ensures the legend and colours remain consistent throughout the animation, at the cost of not accounting | |
# for changes to the top12 | |
# there are two top12s - the first gets the top12 companies, to allow us to replace all the others with 'Other' | |
# The second then recalculates the top 12 so that it includes the 'Others' | |
if(i==1) { | |
top12 <- output %>% | |
group_by(agency_name) %>% | |
filter(!is.na(agency_name)) %>% | |
summarise(count_agencies=n()) %>% | |
slice_max(count_agencies,n=12,with_ties=F) | |
output2 <- output %>% | |
mutate(agency_name=ifelse(agency_name %in% top12$agency_name,agency_name,"Other")) | |
top12 <- output %>% | |
group_by(agency_name) %>% | |
filter(!is.na(agency_name)) %>% | |
summarise(count_agencies=n()) %>% | |
slice_max(count_agencies,n=12,with_ties=F) | |
} | |
output2 <- output %>% | |
mutate(agency_name=ifelse(agency_name %in% top12$agency_name,agency_name,"Other")) | |
agency_pal <- colorFactor("Set3",levels=top12$agency_name) | |
bus_map <- leaflet(data = output) %>% | |
addProviderTiles("CartoDB.Positron") %>% | |
addCircleMarkers(~long,~lat,weight=0.5, color="#333333", radius = 5, fillColor = ~agency_pal(agency_name),fillOpacity=0.5,opacity=1) %>% | |
addPolygons(data = gm, weight=1, color="#444444",fillOpacity=0.05, opacity=1) %>% | |
addControl(startTime) %>% | |
addLegend(pal = agency_pal, values = top12$agency_name, opacity = 1) | |
# save the map as a png | |
mapshot(bus_map, file = paste0("outputs/map",i,".png")) | |
i = i+1 | |
# this bit ensures the data is captured every 60 seconds, rather than 60seconds plus computing time | |
sleepTime <- startTime + 60 - Sys.time() | |
if (sleepTime > 0) | |
Sys.sleep(sleepTime) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment