Last active
November 4, 2019 04:13
-
-
Save Tadge-Analytics/6d7fb69d8686de7f9673cdfefbf1b22c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# first, run the gist at: | |
# https://gist.github.com/Tadge-Analytics/63f75895f0b2e76dae60c1bd5ce8fec7 | |
library(tidyverse) | |
vcglr_data <- readRDS("rds files/vcglr_data.rds") | |
# same location for start and end | |
# note there cannot be a Licence Num (in the VCGLR) that is 1 or 2 | |
# set time at start and end as 25 minutes | |
start_and_end_locations <- tibble(`Licence Num` = c(1L,2L), | |
Latitude = c(-37.8108, -37.8108), | |
Longitude = c(144.9653, 144.9653), | |
ideal_time_hrs = 0.25) | |
# we want a sample.. but for aesthetic reasons... | |
# let's get equal portions near, not-so-near and far... from QV | |
set.seed(123) | |
sample_of_sites <- vcglr_data %>% | |
mutate(distance_m = geosphere::distHaversine(cbind(Longitude, Latitude), | |
cbind(start_and_end_locations[[1, "Longitude"]], | |
start_and_end_locations[[1, "Latitude"]])), | |
dist_groupings = cut(distance_m, 3, labels = c("low","med","high"))) %>% | |
filter(dist_groupings != "high") %>% | |
group_by(dist_groupings) %>% | |
sample_n(15) %>% | |
pull(`Licence Num`) | |
# let's prepare the useful info for our sampled sites | |
selected_sites_details <- vcglr_data %>% | |
select(`Licence Num`, Latitude, Longitude) %>% | |
filter(`Licence Num` %in% sample_of_sites) %>% | |
# let's give some fake ideal times for spending at these desitinations | |
# fake popularity and fake costs, also | |
mutate(ideal_time_hrs = sample(seq(1, 4, 0.5), n(), replace = T), | |
dest_popularity = runif(n(), 1, 5), | |
dest_cost = runif(n(), 25, 150)) %>% | |
bind_rows(start_and_end_locations) | |
pre_dest_combos <- selected_sites_details %>% | |
select(`Licence Num`, Latitude, Longitude) %>% | |
mutate(location = paste(Latitude, Longitude, sep = "+")) | |
dest_combos <- pre_dest_combos %>% | |
select(-`Licence Num`) %>% | |
mutate(data = list(rename_all(., function(x) paste0(x, "1")))) %>% | |
unnest(data) %>% | |
filter(location != location1) | |
# let's get the travel times (rather than distance) between all of these... | |
# using the google maps api, | |
library(gmapsdistance) | |
gmapsdistance::set.api.key("YoUWiLLNeedToUSEUrOWnKey") | |
# let's get onto googling these travel times! | |
# but we don't need to find the travel time of the same trip, both ways | |
# let's also filter out the travel times we've already googled, on previous occaisions | |
googled_times <- read_rds("rds files/travel_times.rds") | |
unique_trips <- dest_combos %>% | |
distinct(location, location1, .keep_all = T) %>% | |
# let's filter out the travel times we've already googled | |
anti_join(googled_times, by = c("location" = "1", "location1" = "2")) %>% | |
# let's remove those "same trips both ways" duplicates | |
rowid_to_column() %>% | |
select(rowid, location, location1) %>% | |
gather(key, value, -rowid) %>% | |
arrange(value) %>% | |
select(-key) %>% | |
group_by(rowid) %>% | |
mutate(rank = row_number()) %>% | |
spread(rank, value) %>% | |
ungroup() %>% | |
distinct(`1`, `2`, .keep_all = T) | |
unique_trips %>% | |
head(5) %>% # this is a safety measure | |
mutate(travel_time = map2_dbl(`1`, `2`, | |
~gmapsdistance(origin = .x, | |
destination = .y, | |
mode = "driving") %>% | |
first())) %>% | |
bind_rows(rename(., `2` =`1`, | |
`1` = `2`)) %>% | |
select(-rowid) %>% | |
bind_rows(googled_times) %>% | |
write_rds("rds files/travel_times.rds") | |
# 10 rows took 5.93 seconds | |
# therefore 465 rows will take 46.5*5.93 == 4.6 minutes (plus cost $$$?) | |
googled_times <- read_rds("rds files/travel_times.rds") | |
all_trips_with_travel_times <- dest_combos %>% | |
inner_join(googled_times, by = c("location" = "1", "location1" = "2")) | |
# for each unique combination of destinations... | |
# what is the travel path that will be taken? | |
# this is purely so that the map displayed will be accurate... instead of just a straight line... | |
# this could be a grovy set action... so that we see the full path if we click on a path... | |
library(googleway) | |
set_key("YoUWiLLNeedToUSEUrOWnKey") | |
googled_paths <- read_rds("rds files/googled_paths.rds") | |
unique_trip_paths <- dest_combos %>% | |
anti_join(googled_paths, by = c("location" = "1", "location1" = "2")) | |
# x <- | |
unique_trip_paths %>% | |
# head(5) %>% # safety measure | |
rename(`1` = location, | |
`2` = location1) %>% | |
mutate(some_list_col = map2(`1`, `2`, ~google_directions(origin = .x, | |
destination = .y, | |
mode = "driving") %>% | |
.$routes%>% | |
.$legs %>% | |
.[[1]] %>% | |
.$steps %>% | |
.[[1]])) %>% | |
bind_rows(googled_paths) %>% | |
write_rds("rds files/googled_paths.rds") | |
# how long does this one take??? | |
googled_paths <- read_rds("rds files/googled_paths.rds") %>% | |
mutate(path_id = row_number(), | |
some_list_col = map(some_list_col, ~do.call(data.frame, c(.x, stringsAsFactors = FALSE)))) %>% | |
unnest(cols = c(some_list_col)) %>% | |
select(-distance.text, -distance.value, -duration.text, -duration.value, -html_instructions, -points, -travel_mode, -maneuver) %>% | |
group_by(path_id) %>% | |
mutate(leg_id = row_number()) | |
zero_leg <- googled_paths %>% | |
distinct(path_id, .keep_all = T) %>% # need to add starting positions and ending posiiotns to the path's collection of legs | |
mutate(leg_id = 0, | |
end_location.lat = start_location.lat, | |
end_location.lng = start_location.lng, | |
start_location.lat = Latitude, | |
start_location.lng = Longitude) | |
last_leg <- googled_paths %>% | |
arrange(desc(leg_id)) %>% | |
distinct(path_id, .keep_all = T) %>% # need to add starting positions and ending posiiotns to the path's collection of legs | |
mutate(leg_id = leg_id + 1, | |
start_location.lat = end_location.lat, | |
start_location.lng = end_location.lng, | |
end_location.lat = Latitude1, | |
end_location.lat = Longitude1) | |
googled_paths %>% | |
bind_rows(zero_leg, | |
last_leg) %>% | |
ungroup() %>% | |
inner_join( | |
pre_dest_combos %>% select(`Licence Num`, location), | |
by = c("1" = "location")) %>% | |
inner_join( | |
pre_dest_combos %>% select(`Licence Num`, location), | |
by = c("2" = "location")) %>% | |
select(`Licence Num.x`, `Licence Num.y`, leg_id, start_location.lat, start_location.lng, end_location.lat, end_location.lng) %>% | |
write_csv("output csvs/googled_paths.csv", na = "") | |
# from here? | |
# let's work out all possible combinations of destinations, for trips with N number of stops | |
# i <- 1 | |
for (i in 1:4) { | |
# let's create a list of possible tours | |
number_of_stops <- i | |
tictoc::tic() | |
all_combos <- as_tibble( | |
expand.grid(rep(list(sample_of_sites), number_of_stops))) %>% | |
mutate(tour_id = row_number()) %>% | |
gather(key, value, -tour_id) %>% | |
mutate(key = as.integer(str_remove(key, "Var"))) %>% | |
group_by(tour_id) %>% | |
filter(n_distinct(value) == number_of_stops) %>% # only keep the tours with unique stops (no point going to the same place twice in the one tour, right?) | |
bind_rows(., distinct(., tour_id) %>% # add in start and end locations | |
crossing(tibble(key = c(0L, number_of_stops + 1L), | |
value = c(1L, 2L)))) %>% | |
arrange(key) %>% | |
mutate(prev_dest = lag(value)) %>% | |
left_join(selected_sites_details, by = c("value" = "Licence Num")) %>% | |
left_join(selected_sites_details %>% select(-ideal_time_hrs, -dest_cost, -dest_popularity), by = c("prev_dest" = "Licence Num")) %>% | |
left_join(all_trips_with_travel_times %>% | |
distinct() %>% | |
select(-location, -location1), by = c("Latitude.x" = "Latitude", | |
"Longitude.x" = "Longitude", | |
"Latitude.y" = "Latitude1", | |
"Longitude.y" = "Longitude1")) | |
print(tictoc::toc()) | |
tours_with_calc <- all_combos %>% | |
group_by(tour_id) %>% | |
summarise(time_at_dests = sum(ideal_time_hrs), | |
travel_time = sum(travel_time, na.rm = T)/3600, | |
total_tour_time = time_at_dests + travel_time, | |
transit_percentage = travel_time/total_tour_time, | |
total_cost = sum(dest_cost, na.rm = T), | |
avg_popularity = mean(dest_popularity, na.rm = T)) | |
tours_with_calc %>% | |
ggplot() + | |
aes(total_tour_time) + | |
geom_histogram() | |
# do any viable tours contain the very same stops? | |
# keep only the one with the shortest total travel time. | |
fastest_unique <- all_combos %>% | |
select(tour_id, value) %>% | |
arrange(value) %>% | |
group_by(tour_id) %>% | |
mutate(arranged_order = row_number()) %>% | |
spread(arranged_order, value) %>% | |
group_by_at(vars(-tour_id)) %>% | |
left_join(tours_with_calc %>% select(tour_id, total_tour_time), by = "tour_id") %>% | |
arrange(total_tour_time) %>% | |
group_by_at(vars(-tour_id, -total_tour_time, -`1`, -`2`)) %>% # only keep the tour with the least time (or, if they're the same, keep the first) | |
slice(1) | |
# which are the viable tours? | |
viable_tours <- tours_with_calc %>% | |
semi_join(fastest_unique, by = "tour_id") %>% | |
# bring these filters back... | |
# after you've fixed the destinations sampling to avoid such far away places... | |
# filter(total_tour_time < 9) %>% | |
# filter(total_tour_time > 2.5) %>% | |
# filter(transit_percentage < 0.3) %>% | |
mutate(number_of_stops = i) | |
# ideally you would also add in a "contains these types" column... | |
# but this might be better at the dashbaord stage | |
filtered_stops <- all_combos %>% | |
semi_join(viable_tours, by = "tour_id") %>% | |
select(-contains(".y"), -prev_dest) %>% | |
rename(Latitude = Latitude.x, | |
Longitude = Longitude.x, | |
stop_number = key, | |
dest_id = value) %>% | |
left_join(vcglr_data %>% mutate(`Licence Num` = as.integer(`Licence Num`)) %>% select(`Licence Num`, `Trading As`, Category), by = c("dest_id" = "Licence Num")) | |
# save files to disk | |
write_csv(viable_tours, paste0("output csvs/viable_tours_", i, ".csv"), na = "") | |
write_csv(filtered_stops, paste0("output csvs/filtered_stops_", i, ".csv"), na = "") | |
rm(viable_tours, filtered_stops) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment