Skip to content

Instantly share code, notes, and snippets.

@Tadge-Analytics
Last active November 4, 2019 04:13
Show Gist options
  • Save Tadge-Analytics/6d7fb69d8686de7f9673cdfefbf1b22c to your computer and use it in GitHub Desktop.
Save Tadge-Analytics/6d7fb69d8686de7f9673cdfefbf1b22c to your computer and use it in GitHub Desktop.
# first, run the gist at:
# https://gist.github.com/Tadge-Analytics/63f75895f0b2e76dae60c1bd5ce8fec7
library(tidyverse)
vcglr_data <- readRDS("rds files/vcglr_data.rds")
# same location for start and end
# note there cannot be a Licence Num (in the VCGLR) that is 1 or 2
# set time at start and end as 25 minutes
start_and_end_locations <- tibble(`Licence Num` = c(1L,2L),
Latitude = c(-37.8108, -37.8108),
Longitude = c(144.9653, 144.9653),
ideal_time_hrs = 0.25)
# we want a sample.. but for aesthetic reasons...
# let's get equal portions near, not-so-near and far... from QV
set.seed(123)
sample_of_sites <- vcglr_data %>%
mutate(distance_m = geosphere::distHaversine(cbind(Longitude, Latitude),
cbind(start_and_end_locations[[1, "Longitude"]],
start_and_end_locations[[1, "Latitude"]])),
dist_groupings = cut(distance_m, 3, labels = c("low","med","high"))) %>%
filter(dist_groupings != "high") %>%
group_by(dist_groupings) %>%
sample_n(15) %>%
pull(`Licence Num`)
# let's prepare the useful info for our sampled sites
selected_sites_details <- vcglr_data %>%
select(`Licence Num`, Latitude, Longitude) %>%
filter(`Licence Num` %in% sample_of_sites) %>%
# let's give some fake ideal times for spending at these desitinations
# fake popularity and fake costs, also
mutate(ideal_time_hrs = sample(seq(1, 4, 0.5), n(), replace = T),
dest_popularity = runif(n(), 1, 5),
dest_cost = runif(n(), 25, 150)) %>%
bind_rows(start_and_end_locations)
pre_dest_combos <- selected_sites_details %>%
select(`Licence Num`, Latitude, Longitude) %>%
mutate(location = paste(Latitude, Longitude, sep = "+"))
dest_combos <- pre_dest_combos %>%
select(-`Licence Num`) %>%
mutate(data = list(rename_all(., function(x) paste0(x, "1")))) %>%
unnest(data) %>%
filter(location != location1)
# let's get the travel times (rather than distance) between all of these...
# using the google maps api,
library(gmapsdistance)
gmapsdistance::set.api.key("YoUWiLLNeedToUSEUrOWnKey")
# let's get onto googling these travel times!
# but we don't need to find the travel time of the same trip, both ways
# let's also filter out the travel times we've already googled, on previous occaisions
googled_times <- read_rds("rds files/travel_times.rds")
unique_trips <- dest_combos %>%
distinct(location, location1, .keep_all = T) %>%
# let's filter out the travel times we've already googled
anti_join(googled_times, by = c("location" = "1", "location1" = "2")) %>%
# let's remove those "same trips both ways" duplicates
rowid_to_column() %>%
select(rowid, location, location1) %>%
gather(key, value, -rowid) %>%
arrange(value) %>%
select(-key) %>%
group_by(rowid) %>%
mutate(rank = row_number()) %>%
spread(rank, value) %>%
ungroup() %>%
distinct(`1`, `2`, .keep_all = T)
unique_trips %>%
head(5) %>% # this is a safety measure
mutate(travel_time = map2_dbl(`1`, `2`,
~gmapsdistance(origin = .x,
destination = .y,
mode = "driving") %>%
first())) %>%
bind_rows(rename(., `2` =`1`,
`1` = `2`)) %>%
select(-rowid) %>%
bind_rows(googled_times) %>%
write_rds("rds files/travel_times.rds")
# 10 rows took 5.93 seconds
# therefore 465 rows will take 46.5*5.93 == 4.6 minutes (plus cost $$$?)
googled_times <- read_rds("rds files/travel_times.rds")
all_trips_with_travel_times <- dest_combos %>%
inner_join(googled_times, by = c("location" = "1", "location1" = "2"))
# for each unique combination of destinations...
# what is the travel path that will be taken?
# this is purely so that the map displayed will be accurate... instead of just a straight line...
# this could be a grovy set action... so that we see the full path if we click on a path...
library(googleway)
set_key("YoUWiLLNeedToUSEUrOWnKey")
googled_paths <- read_rds("rds files/googled_paths.rds")
unique_trip_paths <- dest_combos %>%
anti_join(googled_paths, by = c("location" = "1", "location1" = "2"))
# x <-
unique_trip_paths %>%
# head(5) %>% # safety measure
rename(`1` = location,
`2` = location1) %>%
mutate(some_list_col = map2(`1`, `2`, ~google_directions(origin = .x,
destination = .y,
mode = "driving") %>%
.$routes%>%
.$legs %>%
.[[1]] %>%
.$steps %>%
.[[1]])) %>%
bind_rows(googled_paths) %>%
write_rds("rds files/googled_paths.rds")
# how long does this one take???
googled_paths <- read_rds("rds files/googled_paths.rds") %>%
mutate(path_id = row_number(),
some_list_col = map(some_list_col, ~do.call(data.frame, c(.x, stringsAsFactors = FALSE)))) %>%
unnest(cols = c(some_list_col)) %>%
select(-distance.text, -distance.value, -duration.text, -duration.value, -html_instructions, -points, -travel_mode, -maneuver) %>%
group_by(path_id) %>%
mutate(leg_id = row_number())
zero_leg <- googled_paths %>%
distinct(path_id, .keep_all = T) %>% # need to add starting positions and ending posiiotns to the path's collection of legs
mutate(leg_id = 0,
end_location.lat = start_location.lat,
end_location.lng = start_location.lng,
start_location.lat = Latitude,
start_location.lng = Longitude)
last_leg <- googled_paths %>%
arrange(desc(leg_id)) %>%
distinct(path_id, .keep_all = T) %>% # need to add starting positions and ending posiiotns to the path's collection of legs
mutate(leg_id = leg_id + 1,
start_location.lat = end_location.lat,
start_location.lng = end_location.lng,
end_location.lat = Latitude1,
end_location.lat = Longitude1)
googled_paths %>%
bind_rows(zero_leg,
last_leg) %>%
ungroup() %>%
inner_join(
pre_dest_combos %>% select(`Licence Num`, location),
by = c("1" = "location")) %>%
inner_join(
pre_dest_combos %>% select(`Licence Num`, location),
by = c("2" = "location")) %>%
select(`Licence Num.x`, `Licence Num.y`, leg_id, start_location.lat, start_location.lng, end_location.lat, end_location.lng) %>%
write_csv("output csvs/googled_paths.csv", na = "")
# from here?
# let's work out all possible combinations of destinations, for trips with N number of stops
# i <- 1
for (i in 1:4) {
# let's create a list of possible tours
number_of_stops <- i
tictoc::tic()
all_combos <- as_tibble(
expand.grid(rep(list(sample_of_sites), number_of_stops))) %>%
mutate(tour_id = row_number()) %>%
gather(key, value, -tour_id) %>%
mutate(key = as.integer(str_remove(key, "Var"))) %>%
group_by(tour_id) %>%
filter(n_distinct(value) == number_of_stops) %>% # only keep the tours with unique stops (no point going to the same place twice in the one tour, right?)
bind_rows(., distinct(., tour_id) %>% # add in start and end locations
crossing(tibble(key = c(0L, number_of_stops + 1L),
value = c(1L, 2L)))) %>%
arrange(key) %>%
mutate(prev_dest = lag(value)) %>%
left_join(selected_sites_details, by = c("value" = "Licence Num")) %>%
left_join(selected_sites_details %>% select(-ideal_time_hrs, -dest_cost, -dest_popularity), by = c("prev_dest" = "Licence Num")) %>%
left_join(all_trips_with_travel_times %>%
distinct() %>%
select(-location, -location1), by = c("Latitude.x" = "Latitude",
"Longitude.x" = "Longitude",
"Latitude.y" = "Latitude1",
"Longitude.y" = "Longitude1"))
print(tictoc::toc())
tours_with_calc <- all_combos %>%
group_by(tour_id) %>%
summarise(time_at_dests = sum(ideal_time_hrs),
travel_time = sum(travel_time, na.rm = T)/3600,
total_tour_time = time_at_dests + travel_time,
transit_percentage = travel_time/total_tour_time,
total_cost = sum(dest_cost, na.rm = T),
avg_popularity = mean(dest_popularity, na.rm = T))
tours_with_calc %>%
ggplot() +
aes(total_tour_time) +
geom_histogram()
# do any viable tours contain the very same stops?
# keep only the one with the shortest total travel time.
fastest_unique <- all_combos %>%
select(tour_id, value) %>%
arrange(value) %>%
group_by(tour_id) %>%
mutate(arranged_order = row_number()) %>%
spread(arranged_order, value) %>%
group_by_at(vars(-tour_id)) %>%
left_join(tours_with_calc %>% select(tour_id, total_tour_time), by = "tour_id") %>%
arrange(total_tour_time) %>%
group_by_at(vars(-tour_id, -total_tour_time, -`1`, -`2`)) %>% # only keep the tour with the least time (or, if they're the same, keep the first)
slice(1)
# which are the viable tours?
viable_tours <- tours_with_calc %>%
semi_join(fastest_unique, by = "tour_id") %>%
# bring these filters back...
# after you've fixed the destinations sampling to avoid such far away places...
# filter(total_tour_time < 9) %>%
# filter(total_tour_time > 2.5) %>%
# filter(transit_percentage < 0.3) %>%
mutate(number_of_stops = i)
# ideally you would also add in a "contains these types" column...
# but this might be better at the dashbaord stage
filtered_stops <- all_combos %>%
semi_join(viable_tours, by = "tour_id") %>%
select(-contains(".y"), -prev_dest) %>%
rename(Latitude = Latitude.x,
Longitude = Longitude.x,
stop_number = key,
dest_id = value) %>%
left_join(vcglr_data %>% mutate(`Licence Num` = as.integer(`Licence Num`)) %>% select(`Licence Num`, `Trading As`, Category), by = c("dest_id" = "Licence Num"))
# save files to disk
write_csv(viable_tours, paste0("output csvs/viable_tours_", i, ".csv"), na = "")
write_csv(filtered_stops, paste0("output csvs/filtered_stops_", i, ".csv"), na = "")
rm(viable_tours, filtered_stops)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment