Setup.
library(googleway)
library(dplyr)
library(tibble)
DATA_DIR <- "path/to/dir"
dir.create(DATA_DIR, showWarnings = FALSE, recursive = TRUE)
API_KEY <- Sys.getenv("GOOGLE_PLACES_API_KEY")
raw_my_location_pairs <- read.csv(file.path(DATA_DIR, "google-buffer-coordinates.csv"))
## googleway expects location = c(lat, lng), so take Y then X.
my_location_pairs <- lapply(
  seq_len(nrow(raw_my_location_pairs)),
  function(i) {
    c(
      raw_my_location_pairs[i, "Y"],
      raw_my_location_pairs[i, "X"]
    )
  }
)
my_radius <- 1557 ## Radius in meters
my_types <- c("grocery_or_supermarket", "convenience_store")
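Before the full run, it's worth firing a single request to confirm the key and the response shape. This smoke test is just an illustration (the smoke object isn't used later); it reuses the same google_places arguments as the loop below.
smoke <- googleway::google_places(
  location = my_location_pairs[[1]], ## first c(lat, lng) pair
  radius = my_radius,
  place_type = my_types[[1]],
  key = API_KEY
)
smoke$status        ## expect "OK" (or "ZERO_RESULTS" in a sparse area)
nrow(smoke$results) ## the API returns at most 20 results per page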
Ugly loops so it's easier to understand for non-purrr people (a purrr-flavored sketch appears after the loop).
## tibble instead of just data.frame cuz 0-row data.frames have dumb behavior
places <- tibble::tibble()
for (j in seq_along(my_location_pairs)) {
  location_pair <- my_location_pairs[[j]]
  for (type in my_types) {
    last_page_token <- NULL
    keep_going <- TRUE
    i <- 1
    while (isTRUE(keep_going)) {
      ## Cache each response so we don't have to re-scrape everything if
      ## something breaks in the middle of the run.
      hash <- paste0(sprintf("%03.0f", j), "-", type, "-", i)
      cached_file_path <- file.path(DATA_DIR, paste0(hash, ".rds"))
      if (file.exists(cached_file_path)) {
        res <- readRDS(cached_file_path)
      } else {
        message(paste0("Loop ", i, " for type ", type, " and location pair ", j, "."))
        ## Randomly wait between 3 and 5 seconds to avoid rate limiting.
        Sys.sleep(runif(1, 3, 5))
        res <- googleway::google_places(
          location = location_pair,
          radius = my_radius,
          place_type = type,
          key = API_KEY,
          page_token = last_page_token
        )
        saveRDS(res, cached_file_path)
      }
      i <- i + 1
      if (res$status == "ZERO_RESULTS") {
        ## No rows came back, so there's nothing to append; stop paging.
        keep_going <- FALSE
        next
      }
      if (nrow(res$results) < 20) {
        ## A partial page (fewer than 20 rows) means this was the last page.
        keep_going <- FALSE
      }
      res$results$OID_ <- j
      res$results$type <- type
      places <- dplyr::bind_rows(places, res$results)
      ## Check whether we've seen every place for this location/type combo at
      ## least twice, then end the loop. next_page_token never comes back NULL,
      ## so if we don't end the loop ourselves it will keep going forever.
      place_counts <- places |>
        dplyr::filter(.data$type == .env$type, .data$OID_ == j) |>
        dplyr::count(place_id)
      n_dups <- place_counts |>
        dplyr::filter(n >= 2) |>
        nrow()
      if (n_dups == nrow(place_counts)) {
        keep_going <- FALSE
      }
      last_page_token <- res$next_page_token
    }
  }
}
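For the purrr people, here's roughly what the pagination could look like as a helper mapped over every location/type pair. Treat it as a shape reference rather than a drop-in replacement: fetch_all_pages, grid, and places_purrr are hypothetical names, it skips the caching above, and it assumes next_page_token does eventually come back NULL (which the duplicate check above exists to work around).
fetch_all_pages <- function(location_pair, type) {
  pages <- list()
  token <- NULL
  repeat {
    res <- googleway::google_places(
      location = location_pair,
      radius = my_radius,
      place_type = type,
      key = API_KEY,
      page_token = token
    )
    if (!identical(res$status, "OK")) break
    pages <- c(pages, list(res$results))
    token <- res$next_page_token
    if (is.null(token)) break
    Sys.sleep(runif(1, 3, 5)) ## same anti-rate-limit pause as above
  }
  dplyr::bind_rows(pages)
}
## One row per location/type combination, matched by name into the lambda.
grid <- tidyr::expand_grid(j = seq_along(my_location_pairs), type = my_types)
places_purrr <- purrr::pmap_dfr(
  grid,
  \(j, type) fetch_all_pages(my_location_pairs[[j]], type) |>
    dplyr::mutate(OID_ = j, type = .env$type)
)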
Clean up.
## Drop duplicate places, attach each buffer's source coordinates via OID_,
## and flatten the nested geometry into lng/lat columns.
distinct_places <- places |>
  dplyr::distinct(
    place_id,
    .keep_all = TRUE
  ) |>
  dplyr::left_join(
    raw_my_location_pairs |>
      dplyr::select(
        OID_,
        X,
        Y
      ),
    by = dplyr::join_by(OID_)
  ) |>
  tidyr::unnest_wider(geometry) |>
  tidyr::unnest_wider(location) |>
  dplyr::transmute(
    OID_,
    X,
    Y,
    lng,
    lat,
    type,
    place_id,
    name,
    vicinity,
    price_level,
    rating,
    user_ratings_total,
    all_api_types = purrr::map_chr(types, \(.x) paste0(sort(unique(.x)), collapse = ",")),
    dplyr::across(all_api_types, \(.x) ifelse(.x == "", NA_character_, .x))
  )
readr::write_csv(distinct_places, file.path(DATA_DIR, "all_places.csv"), na = "")
## Explode the comma-separated API types into one 0/1 indicator column per type.
distinct_places |>
  dplyr::select(
    place_id,
    all_api_types
  ) |>
  tidyr::separate_rows(all_api_types) |>
  dplyr::distinct(
    place_id,
    all_api_types
  ) |>
  dplyr::mutate(value = 1L) |>
  tidyr::pivot_wider(
    names_from = all_api_types,
    values_from = value,
    values_fill = 0L
  ) |>
  readr::write_csv(file.path(DATA_DIR, "all_api_types.csv"), na = "")
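As an optional sanity check (type_dummies is just an illustrative name), you can read the indicator table back and tally how many places carry each API type:
type_dummies <- readr::read_csv(
  file.path(DATA_DIR, "all_api_types.csv"),
  show_col_types = FALSE
)
## Every column except place_id is a 0/1 indicator, so colSums() counts places per type.
sort(colSums(type_dummies[setdiff(names(type_dummies), "place_id")]), decreasing = TRUE)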