Last active
October 12, 2018 17:15
—
Save troyhill/6b29802bc5250485294e5cb989bc99b9 to your computer and use it in GitHub Desktop.
An abbreviated R API for SFWMD's DBHYDRO: download station and project data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/Rscript
### functions to download DBHYDRO station and project data
# setwd("/home/thill/RDATA/dbhydro/data/20180419/")
library(httr)

# Command line arguments: order matters (right now).
# Arguments must be in order:
#   [project/station name] [working_dir] [reportType: station or project]
# NOTE(review): positional indices 6-8 assume a fixed `Rscript file.R a b c`
# invocation; confirm against the calling shell before changing.
name        <- gsub("[^[:alnum:] ]", "", commandArgs()[6])  # strip punctuation from the item name
working_dir <- sapply(strsplit(commandArgs()[7], split = "\\["), "[", 1)
reportType  <- commandArgs()[8]

# Trailing "\n" keeps each status line on its own line; the original cat()
# calls ran together on stdout.
cat("item name: ", name, "\n")
cat("working directory for R: ", working_dir, "\n")
cat("report type: ", reportType, "\n")
# setwd(args[7])
#' Download water-quality data for one or more DBHYDRO stations.
#'
#' Builds a DBHYDRO report URL and fetches it to `destfile` via httr::GET.
#'
#' @param destfile Output file name; "stn" and "todaysDate" tokens are
#'   substituted when present.
#' @param stations Character vector of station ids; "%25" acts as a wildcard.
#'   If one station in the list returns no data, the url request fails.
#' @param rename_proj Replace "stn" in `destfile` with the station name?
#' @param parameters Currently unused placeholder.
#' @param report_type "full" = long dataset (1 line per sample);
#'   "crosstab" = wide dataset.
#' @param incl_qc_flags,incl_flagged_data Logical scalars; validated below.
#' @param destination Only "file_csv" is supported.
#' @param start_date,end_date Dates in DD-MON-YYYY format; end_date "today"
#'   resolves to the current date.
#' @param import_data If TRUE, read the downloaded csv and return it.
#' @return The imported data.frame when `import_data = TRUE`; otherwise
#'   invisibly NULL (the download itself is the side effect).
dbh_stn_data <- function(destfile = "stn_report_todaysDate.csv",
                         stations = c("ACRA1"),
                         rename_proj = TRUE,
                         parameters = "all",
                         report_type = "full",
                         incl_qc_flags = TRUE,
                         incl_flagged_data = FALSE,
                         destination = "file_csv",
                         start_date = "01-JAN-1960",
                         end_date = "today",
                         import_data = FALSE) {
  if (isTRUE(rename_proj)) {
    # NOTE(review): with multiple stations the vector replacement only uses
    # its first element (with a warning) -- confirm intended behavior.
    destfile <- gsub(pattern = "stn",
                     replacement = gsub(pattern = "'", replacement = "", x = stations),
                     x = destfile)
  }
  # Validate enumerated/logical arguments up front, before any network work.
  # identical() makes the stop() branches reachable for NA input (the
  # original `x == TRUE` chains errored with an unhelpful message on NA).
  if (report_type == "full") {
    report_type_text <- "&v_report=ctr_1_true"
  } else if (report_type == "crosstab") {
    report_type_text <- "&v_report=ctr_w"
  } else {
    stop("invalid input for 'report_type': must be either 'full' or 'crosstab'")
  }
  if (identical(incl_qc_flags, TRUE)) {
    qc_flags_text <- "&v_exc_qc=N"
  } else if (identical(incl_qc_flags, FALSE)) {
    qc_flags_text <- "&v_exc_qc=Y"
  } else {
    stop("invalid input for 'incl_qc_flags': must be TRUE/FALSE")
  }
  if (identical(incl_flagged_data, FALSE)) {
    flagged_data_text <- "&v_exc_flagged=Y"
  } else if (identical(incl_flagged_data, TRUE)) {
    flagged_data_text <- "&v_exc_flagged=N"
  } else {
    stop("invalid input for 'incl_flagged_data': must be TRUE/FALSE")
  }
  if (destination != "file_csv") {
    stop("Currently only csv output is supported. Use argument 'destination = 'file_csv'")
  }
  if (end_date == "today") {
    end_date <- toupper(format(Sys.Date(), "%d-%b-%Y"))  # e.g. 12-OCT-2018
  }
  if (grepl("todaysDate", destfile)) {
    destfile <- gsub(pattern = "todaysDate",
                     replacement = format(Sys.Date(), "%Y%m%d"), x = destfile)
  }
  # any() keeps the condition scalar when `stations` has length > 1
  # (bare grepl() on a vector errors inside if()).
  if (any(grepl("%25", stations))) {
    # wild cards are used in station references, so the url must use "like"
    stationRef <- "'%20and%20station_id%20like%20('"
  } else {
    # no wild cards, so the url must use "in"
    stationRef <- "'%20and%20station_id%20in%20('"
  }
  # NOTE(review): report_type_text, qc_flags_text, flagged_data_text, and
  # end_date are constructed above but never appended to the request url --
  # almost certainly an unfinished feature. Confirm the DBHYDRO parameter
  # names before wiring them in; the url is left as originally built.
  url.init <- paste0("http://my.sfwmd.gov/dbhydroplsql/water_quality_data.report_full?v_where_clause=where%20date_collected%20%3e%20'",
                     start_date,
                     stationRef, stations, "')",
                     "&v_target_code=", destination)
  # Long explicit timeout: these reports can take minutes to generate.
  httr::GET(url.init, write_disk(destfile, overwrite = TRUE), timeout(99999))
  if (isTRUE(import_data)) {
    # Explicit return: the original only returned this invisibly via the
    # final assignment, so `x <- dbh_stn_data(...)` silently worked while
    # interactive calls appeared to return nothing.
    return(read.csv(destfile))
  }
  invisible(NULL)
}
#' Download station data for each code in `codes`, continuing past failures.
#'
#' @param codes Character vector of station identifiers.
#' @param ... Further arguments forwarded to dbh_stn_data().
#' @return Invisibly NULL; called for its file-download side effects.
dbh_stn_data_batch <- function(codes, ...) {
  for (i in seq_along(codes)) {
    tryCatch({
      print(codes[i])  # progress indicator
      dbh_stn_data(stations = codes[i], ...)
    }, error = function(e) {
      # message() accepts multiple parts; the original
      # print("Error in project ", codes[i], ...) call misused print()'s
      # positional arguments and errored inside the handler itself.
      message("Error in station ", codes[i], ": ", conditionMessage(e))
    })
  }
  invisible(NULL)
}
### dbh_stn_data() usage | |
# input_stns <- c("G300") | |
# test <- dbh_stn_data(stations = input_stns) | |
# summary(test[, 1:4]) | |
# | |
## CAMB stand-in: | |
# input_stns <- c("'G300'") # 'G300','S5A','S5AE','S5AS','S5AU','S5AUS','S5AW' | |
# test <- dbh_stn_data(stations = input_stns) | |
# | |
# input_stns <- c('G300','S5A','S5AE','S5AS','S5AU','S5AUS','S5AW') | |
# for (i in 1:length(input_stns)) { | |
# test <- dbh_stn_data(stations = input_stns[i]) | |
# } | |
# summary(test[, 1:4]) | |
# | |
# dbh_stn_data_batch(codes = input_stns) | |
# | |
# input_stns2 <- c("'ACRA%25'") # wild cards set by "%25" | |
# test2 <- dbh_stn_data(stations = input_stns2) | |
# summary(test2[, 1:4]) | |
# | |
# input_stns3 <- c("'ACRA%25','G300'") # database problem: cannot mix wild-cards and exact station names, or do a multiple-query wildcard search (all return empty .xls) | |
# test3 <- dbh_stn_data(stations = input_stns3) | |
# summary(test3[, 1:4]) | |
### | |
#' Download water-quality data for a DBHYDRO project code.
#'
#' Builds a DBHYDRO report URL and fetches it to `destfile` via httr::GET.
#'
#' @param destfile Output file name; "proj" and "todaysDate" tokens are
#'   substituted when present.
#' @param project_codes Project code string (e.g. "ACMEB").
#' @param start_date Date in DD-MON-YYYY format. Currently NOT applied to the
#'   query (see note below).
#' @param destination Only "file_csv" is known to work.
#' @param rename_proj Replace "proj" in `destfile` with the project name?
#' @param import_data If TRUE, read the downloaded csv and return it.
#' @return The imported data.frame when `import_data = TRUE`; otherwise
#'   invisibly NULL (the download itself is the side effect).
dbh_proj_data <- function(destfile = "proj_report_todaysDate.csv",
                          project_codes = "ACMEB",
                          start_date = "01-JAN-1960",
                          destination = "file_csv",
                          rename_proj = TRUE,
                          import_data = FALSE) {
  # Stamp today's date into the default file name.
  if (grepl("todaysDate", destfile)) {
    destfile <- gsub(pattern = "todaysDate",
                     replacement = format(Sys.Date(), "%Y%m%d"), x = destfile)
  }
  if (isTRUE(rename_proj)) {
    destfile <- gsub(pattern = "proj",
                     replacement = gsub(pattern = "'", replacement = "", x = project_codes),
                     x = destfile)
  }
  # NOTE(review): the original branched on a "%25" wildcard here, but both
  # branches produced the same "project_code='" fragment, so wildcard queries
  # are sent as exact matches (and return empty results). A "like" clause --
  # as in dbh_stn_data() -- is probably intended; confirm against the DBHYDRO
  # API before changing. Collapsing the dead branch also avoids grepl()
  # erroring inside if() for vector input.
  projectRef <- "project_code='"
  # start_date is accepted but not yet wired into the where-clause; the
  # commented fragment below was never finished.
  url.init <- paste0("http://my.sfwmd.gov/dbhydroplsql/water_quality_data.report_full?v_where_clause=where%20",
                     # "date_collected%20%3e%20='", start_date,
                     projectRef, project_codes, "'",
                     "&v_target_code=", destination)
  # download.file() hit timeouts even with huge options(timeout=...), and
  # RCurl::getURL() crashed RStudio; httr::GET with a long timeout works.
  httr::GET(url.init, write_disk(destfile, overwrite = TRUE), timeout(99999))
  if (isTRUE(import_data)) {
    # Explicit return; the original only returned this invisibly via the
    # final assignment.
    return(read.csv(destfile))
  }
  invisible(NULL)
}
# dbh_proj_data(project_codes = "CAMB") | |
#' Download project data for each code in `codes`, continuing past failures.
#'
#' @param codes Character vector of project codes.
#' @param ... Further arguments forwarded to dbh_proj_data().
#' @return Invisibly NULL; called for its file-download side effects.
dbh_proj_data_batch <- function(codes, ...) {
  for (i in seq_along(codes)) {
    tryCatch({
      print(codes[i])  # progress indicator
      dbh_proj_data(project_codes = codes[i], ...)
    }, error = function(e) {
      # message() accepts multiple parts; the original
      # print("Error in project ", codes[i], ...) call misused print()'s
      # positional arguments and errored inside the handler itself.
      message("Error in project ", codes[i], ": ", conditionMessage(e))
    })
  }
  invisible(NULL)
}
### dbh_proj_data_batch() usage | |
# STA1E_project_codes <- c("ENRR", 'ST1E', 'ST1W', 'CAMB', "EAAP", "ST1EM", | |
# "ST1EG", "RAIN", "X", "TM", "PEST", "EVPA", "HGLE", | |
# "ST1M", "ST1G", "LAB", "L8RT" # , "ST1F" appears to be fish data | |
# ) | |
# STA1W_project_codes <- c("ENRM", "ENRP", "ENRU") # duplicates that overlap with STA1E: "ENRR", "LAB", "CAMB", "ST1E", "ST1W", "ST1G", "ST1M", "ST1F", "ST1EM") | |
# STA2_project_codes <- c("STA2", "ST2M", "ST2G", "EAA", "USGS", "HGOS") # duplicates: c("CAMB", "EAAP", "EVPA", "PEST", "RAIN", "X") "ST2F" appears to be fish data | |
# STA34_project_codes <- c("ST34", "ST34G", "ST34M", "A1FEB", "HOLY") # duplicates: c("CAMB", "EAA", "EVPA", "HGOS", "PEST", "RAIN", "USGS", "X") "ST34F" appears to be fish data | |
# STA56_project_codes <- c("ST5R", "STA5", "STA6", "ST6M", "SEMI", "RTBG", "ST5M") # duplicates: c("PEST", "CAMB") "ST6F" appears to be fish data | |
# | |
# dbh_proj_data_batch(codes = STA1E_project_codes) | |
# dbh_proj_data_batch(codes = STA1W_project_codes) | |
# dbh_proj_data_batch(codes = STA2_project_codes) | |
# dbh_proj_data_batch(codes = STA34_project_codes) | |
# dbh_proj_data_batch(codes = STA56_project_codes) | |
### | |
# Dispatch on the requested report type. Unrecognized values simply fall
# through with no action; the output file is written under working_dir with
# a "todaysDate" token that the batch helpers expand.
if (identical(reportType, "project")) {
  dbh_proj_data_batch(codes = c(name),
                      destfile = paste0(working_dir, "/", "proj_report_todaysDate.csv"))
}
if (identical(reportType, "station")) {
  dbh_stn_data_batch(codes = c(name),
                     destfile = paste0(working_dir, "/", "stn_report_todaysDate.csv"))
}
###
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment