Created
March 16, 2018 03:24
-
-
Save lashlee/a1579c99395a5589d50f9683dd424755 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(ggmap) | |
#Test number must be of the form of a number (ndx) followed by a single letter (version). | |
is_valid_test_nbr <- function(test_nbr) grepl("^[0-9]+[A-Za-z]?{1}$",test_nbr) | |
##Test | |
identical(is_valid_test_nbr(c("A","123","123A","123AB","_123A","_123","_A")), | |
c(FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE)) | |
get_test_nbr_ndx <- function(test_nbr) { | |
if (all(is_valid_test_nbr(test_nbr))) { | |
as.integer(gsub("[A-Za-z]","",test_nbr)) | |
} else { | |
stop("At least one test number is not valid.") | |
} | |
} | |
get_test_nbr_version <- function(test_nbr) { | |
if (all(is_valid_test_nbr(test_nbr))) { | |
gsub("[0-9]","",test_nbr) | |
} else { | |
stop("At least one test number is not valid.") | |
} | |
} | |
csv <- read.csv("data.csv",stringsAsFactors=FALSE,na.strings=c(""," ","NA","-")) | |
dat <- csv %>% | |
rename(test_nbr = "Test..No.", | |
hydrant_nbr_pitot = "Pitot.Hydrant.No.", | |
hydrant_nbr_static_residual = "Static.and.Residual.Hydrant.No.", | |
test_dt = "Test.Date", | |
addr = "Approximate.Location", | |
pressure_static_psi = "Static.Press...psi.", | |
pressure_residual_psi = "Residual.Press...psi.", | |
pressure_pitot_psi = "Pitot.Press...psi.", | |
orifice_diameter_in = "Orifice.Diam...in.", | |
discharge_obs_gpm = "Observed.Discharge..gpm.", | |
discharge_est_at_20_psi_gpm = "Est..Discharge..20.psi..gpm.", | |
pitot_tube_ndx = "Pitot.Tube", | |
orifice_diameter_2_in = "Orifice.Diam...in..1", | |
discharge_obs_2_gpm = "Observed.Discharge..gpm..1") %>% | |
filter(test_nbr != "") %>% | |
mutate(test_nbr_ndx = get_test_nbr_ndx(test_nbr), | |
test_nbr_version = get_test_nbr_version(test_nbr), | |
test_dt = as.Date(test_dt,format="%m/%d/%Y"), | |
pitot_tube_ndx = if_else(pitot_tube_ndx=="<NA>",NA_integer_,as.integer(pitot_tube_ndx)), | |
addr = if_else(addr=="Sutro Street and Sean Street","Sutro Street and Dean Street",addr)) %>% | |
mutate(addr_haywarded = paste(addr, "Hayward CA")) %>% | |
mutate_at(vars(discharge_obs_gpm, | |
discharge_est_at_20_psi_gpm, | |
orifice_diameter_2_in, | |
discharge_obs_2_gpm), | |
function(x) {as.numeric(gsub(",","",x))}) %>% | |
mutate_geocode(addr_haywarded) %>% #This is the very time-expensive call to Google for lat,long pairs per address. | |
arrange(test_nbr_ndx,test_nbr_version,test_dt) %>% | |
group_by(test_nbr_ndx) %>% mutate(test_nbr_version_ndx = row_number()) %>% ungroup() %>% | |
# Manual corrections because Google is dumb | |
mutate(lat = if_else(addr_haywarded=="Tripaldi Way and Hesperian Blvd Hayward CA",37.614299,lat), | |
lon = if_else(addr_haywarded=="Tripaldi Way and Hesperian Blvd Hayward CA",-122.086035,lon)) | |
save(dat,file="cleaned_data.RData") | |
write.table(dat,file="fire_flow.tsv",sep="\t",row.names=FALSE) | |
#This is really crap code. | |
dat2 <- dat %>% filter(is.na(lon) | is.na(lat)) | |
missing_geocodes <- geocode(dat2$addr) | |
dat3 <- data.frame(dat2,missing_geocodes) %>% select(test_nbr_ndx,test_nbr_version_ndx,lat.1,lon.1) | |
dat4 <- dat %>% | |
left_join(dat3,by=c("test_nbr_ndx","test_nbr_version_ndx")) %>% | |
mutate(lat=coalesce(lat,lat.1),lon=coalesce(lon,lon.1)) %>% | |
select(-c(lat.1,lon.1)) | |
dat5 <- dat4 %>% filter(is.na(lon) | is.na(lat)) | |
missing_geocodes <- geocode(dat5$addr) | |
dat6 <- data.frame(dat5,missing_geocodes) %>% select(test_nbr_ndx,test_nbr_version_ndx,lat.1,lon.1) | |
dat7 <- dat4 %>% | |
left_join(dat6,by=c("test_nbr_ndx","test_nbr_version_ndx")) %>% | |
mutate(lat=coalesce(lat,lat.1),lon=coalesce(lon,lon.1)) %>% | |
select(-c(lat.1,lon.1)) | |
dat8 <- dat7 %>% filter(is.na(lon) | is.na(lat)) | |
missing_geocodes <- geocode(dat8$addr) | |
dat9 <- data.frame(dat8,missing_geocodes) %>% select(test_nbr_ndx,test_nbr_version_ndx,lat.1,lon.1) | |
dat10 <- dat7 %>% | |
left_join(dat9,by=c("test_nbr_ndx","test_nbr_version_ndx")) %>% | |
mutate(lat=coalesce(lat,lat.1),lon=coalesce(lon,lon.1)) %>% | |
select(-c(lat.1,lon.1)) | |
dat_nonmissing <- dat10 | |
save(dat_nonmissing,file="nonmissing_cleaned_data.RData") | |
write.table(dat_nonmissing,file="fire_flow.tsv",sep="\t",row.names=FALSE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment