setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

library(tidyverse)
# data can be downloaded from:
# https://www.dropbox.com/s/odg6udrhahsqhiz/SydneyFuelPriceHistory.zip?dl=1

first_changes <- read_csv("data/SydneyFuelPriceHistory.csv") %>% 
  # shift each update to the end of its hour (hour 24 rolls over to hour 0 of the next day)
  mutate(hour_of_day = lubridate::hour(PriceUpdatedDate) + 1,
         date = as.Date(PriceUpdatedDate),
         date = if_else(hour_of_day == 24, date + 1, date),
         hour_of_day = if_else(hour_of_day == 24, 0, hour_of_day)) %>% 
  # if there are multiple updates within an hour, keep only the latest one for that hour
  arrange(desc(PriceUpdatedDate)) %>% 
  distinct(ServiceStationName, Address, Suburb, Postcode, FuelCode, date, hour_of_day, .keep_all = T) %>% 
  select(-PriceUpdatedDate, -RecordID) %>% 
  # we only really care about the most recent year...
  # the past is the past, right?
  filter(lubridate::year(date) == 2018)
# we now need to get some unique identifiers for the stations
# it turns out the same addresses are sometimes entered differently in our data

unique_stations <- first_changes %>% 
  arrange(ServiceStationName) %>% 
  distinct(ServiceStationName, Address, Suburb, Postcode, Brand) %>% 
  # normalise the station names: lower case, trim, collapse double spaces, tidy hyphens
  mutate(new_ServiceStationName = trimws(tolower(ServiceStationName)),
         new_ServiceStationName = str_replace(new_ServiceStationName, "  ", " "),
         new_ServiceStationName = str_replace(new_ServiceStationName, " - ", "-"),
         new_ServiceStationName = str_replace(new_ServiceStationName, "- ", "-")) %>% 
  mutate(new_suburb = trimws(tolower(Suburb))) %>% 
  # normalise the addresses: expand common abbreviations and tidy the spacing
  mutate(new_address = trimws(tolower(Address)),
         new_address = str_replace(new_address, "rd,", "road,"),
         new_address = str_replace(new_address, "st,", "street,"),
         new_address = str_replace(new_address, "hwy,", "highway,"),
         new_address = str_replace(new_address, "  ", " "),
         new_address = str_replace(new_address, "- ", "-"),
         new_address = str_replace(new_address, " - ", "-")) %>% 
  mutate(full_name = paste(new_ServiceStationName, new_suburb, Postcode, Brand, sep = "|"))
# At this point, I actually used a nifty "cyclic recoding" technique
# to detect whether any duplicates were left over after the above corrections...
# There were initially... but not after filtering for dates >= 2018.
# Nevertheless, you can see how I did this at the following gist
# (a rough duplicate check is also sketched just below).
# https://gist.github.com/Tadge-Analytics/9ce7f6f5333339cd48c1ad4278a08905
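
# A minimal sketch of one way to check for leftover duplicates -- this is an
# illustration only, not necessarily the "cyclic recoding" approach from the gist above:
# flag any cleaned address that still maps to more than one full_name.
leftover_duplicates <- unique_stations %>% 
  group_by(new_address) %>% 
  filter(n_distinct(full_name) > 1) %>% 
  ungroup() %>% 
  arrange(new_address)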
# fuel_prices_by_day <- 

first_changes %>% 
  left_join(unique_stations %>% select(-contains("new")), 
            by = c("ServiceStationName", "Address", "Suburb", "Postcode", "Brand")) %>% 
  select(full_name, date, hour_of_day, FuelCode, Price) %>% 
  # expand to a complete station x date x hour x fuel grid, so prices can be carried forward
  right_join(
    crossing(full_name = unique(unique_stations$full_name),
             date = seq.Date(min(first_changes$date), max(first_changes$date), 1),
             hour_of_day = 0:23,
             FuelCode = unique(first_changes$FuelCode))
    , by = c("full_name", "date", "hour_of_day", "FuelCode")) %>% 
  # make sure rows are in chronological order before carrying the last known price forward
  arrange(date, hour_of_day) %>% 
  group_by(full_name, FuelCode) %>% 
  fill(Price) %>% 
  filter(!is.na(Price)) %>% 
  group_by(full_name, date, FuelCode) %>% 
  summarise(daily_mean_price = mean(Price)) %>% 
  spread(FuelCode, daily_mean_price) %>% 
  write_csv("fuel_prices_by_day.csv", na = "")
# unique_stations_tidy_names <- 

unique_stations %>% 
  select(full_name, Postcode, Brand, contains("new")) %>% 
  distinct(full_name, .keep_all = T) %>% 
  # drop the "new_" prefix from the cleaned columns (funs() is defunct in current dplyr)
  rename_at(.vars = vars(contains("new")), 
            .funs = ~ sub("new_", "", .)) %>% 
  write_csv("unique_stations_neat_parts.csv", na = "")
# let's get the lat/longs
# save yourself some time (and all of your Google Cloud credits)
# by downloading the lat/longs here:
# https://www.dropbox.com/s/zf4jcrqm9ayw4cp/googled%20addresses.csv?dl=1

library(ggmap)
register_google(key = "YoUWiLLNeedToUSEUrOWnKey")

# bring in those that have already been searched...
googled_addresses <- read_csv("googled addresses.csv")
# only google those that don't appear in our existing collection

tibble(address = setdiff(unique(unique_stations$new_address), googled_addresses$address)) %>% 
  head(5) %>%   # a safety measure: comment this out for the full run
  mutate(lat_long_data = map(address, ~geocode(.x, output = "latlona", source = "google"))) %>% 
  unnest(lat_long_data) %>% 
  bind_rows(googled_addresses) %>% 
  write_csv("googled addresses.csv", na = "")
# valid_googled_addresses <- 

read_csv("googled addresses.csv") %>% 
  filter(!is.na(address1)) %>% 
  # drop geocodes that landed implausibly far from the median point (likely bad matches)
  mutate(distance_from_median_point = geosphere::distHaversine(cbind(lon, lat), cbind(median(lon), median(lat)))) %>% 
  filter(distance_from_median_point < 1.1E6) %>% 
  select(-distance_from_median_point) %>% 
  distinct(address, .keep_all = T) %>% 
  write_csv("valid_googled addresses.csv", na = "")