Created
September 7, 2015 16:43
-
-
Save dmpe/5aec87f0c7a5ae2115ca to your computer and use it in GitHub Desktop.
returnData.R used for benchmarking RSocrata row binding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# An interface to data hosted online in Socrata data repositories | |
# This is the main file, which uses other functions to download data from Socrata repositories
# | |
# Author: Hugh J. Devlin, Ph.D et al. 2013-08-28 | |
############################################################################### | |
# library("httr") # for access to the HTTP header | |
# library("jsonlite") # for parsing data types from Socrata | |
# library("mime") # for guessing mime type | |
# library("geojsonio") # for geospatial json | |
# library("plyr") # for a faster binding of rows | |
#' Content parsers
#'
#' Return a data frame for a csv or json response. The media type is taken
#' from the response's \code{content-type} header (optional parameters such as
#' \code{; charset=utf-8} are ignored).
#'
#' JSON records are aligned by field name, so records that omit some fields
#' (for example because the value was \code{null}) get \code{NA} in those
#' columns instead of breaking the conversion.
#'
#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
#' @importFrom httr content
#' @importFrom geojsonio geojson_read
#' @param response - an httr response object
#' @return data frame, possibly empty; \code{NULL} when the content type is
#'   neither \code{text/csv} nor \code{application/json}
#' @noRd
getContentAsDataFrame <- function(response) {
  # `[[` avoids relying on partial matching of `$header` against `headers`
  mimeType <- response$headers[["content-type"]]
  if (!is.character(mimeType) || length(mimeType) != 1L || is.na(mimeType)) {
    stop("Response has no usable content-type header.", call. = FALSE)
  }
  # skip optional parameters (everything from the first ';' on)
  mimeType <- tolower(trimws(sub(";.*$", "", mimeType)))
  switch(mimeType,
    "text/csv" = httr::content(response), # automatic parsing
    "application/json" = {
      records <- httr::content(response)
      if (length(records) == 0L) {
        # empty json array
        data.frame()
      } else {
        # one flat (possibly named) vector per record
        rows <- lapply(records, unlist)
        fields <- unique(unlist(lapply(rows, names), use.names = FALSE))
        if (is.null(fields)) {
          # no record carries field names: keep the legacy conversion
          data.frame(t(sapply(records, unlist)), stringsAsFactors = FALSE)
        } else {
          # align every record on the union of field names; fields a record
          # does not have become NA
          aligned <- lapply(rows, function(row) {
            if (is.null(row)) rep(NA, length(fields)) else unname(row[fields])
          })
          cells <- matrix(
            unlist(aligned, use.names = FALSE),
            nrow = length(aligned),
            byrow = TRUE,
            dimnames = list(NULL, fields)
          )
          data.frame(cells, stringsAsFactors = FALSE)
        }
      }
    },
    # unsupported content type
    NULL
  )
}
#' Get a full Socrata data set as an R data frame (base \code{rbind} variant)
#'
#' @description Downloads the first page, then keeps requesting further pages
#' (via \code{$offset} / \code{$limit}) until the row count reported by the
#' data set metadata is reached or a page comes back empty. Pages are appended
#' with \code{rbind}. Columns whose SODA type is \code{calendar_date} are
#' converted to POSIX date-times. We advise to use .csv suffix
#' rather than .json, even though both are supported. With .json, however, there may be some bugs.
#'
#' @param url - A Socrata resource URL, or a Socrata "human-friendly" URL,
#' or Socrata Open Data Application Program Interface (SODA) query
#' requesting a comma-separated download format (.csv suffix).
#' May include SoQL parameters.
#' Either use a complete URL or use \code{domain} and \code{fourByFour} to construct your URL.
#' @param app_token - a (non-required) string; SODA API token can be used to query the data
#' portal \url{http://dev.socrata.com/consumers/getting-started.html}
#' @param query - Based on query language called the "Socrata Query Language" ("SoQL"), see
#' \url{http://dev.socrata.com/docs/queries.html}. Currently not used by this function.
#' @param limit - page size for follow-up requests; defaults to the max of 50000.
#' See \url{http://dev.socrata.com/docs/paging.html}.
#' @param offset - defaults to 0. See \url{http://dev.socrata.com/docs/paging.html}.
#' Currently not used by this function; paging always continues from the rows already fetched.
#' @param output - defaults to csv; one of \code{"csv" or "json"}. Only used when the URL is
#' built from \code{domain} and \code{fourByFour}.
#' @param domain - A Socrata domain, e.g \url{http://data.cityofchicago.org}
#' @param fourByFour - a unique 4x4 identifier, e.g. "ydr8-5enu". See more \code{\link{isFourByFour}}
#'
#' @section TODO: \url{https://github.com/Chicago/RSocrata/issues/14}
#' @section Issue: If you get something like \code{Error in rbind(deparse.level, ...) :
#' numbers of columns of arguments do not match} when using "json" output, this is a known bug
#' \url{https://github.com/Chicago/RSocrata/issues/19}! Use instead csv output for time being.
#'
#' @return a data frame with POSIX dates in csv or json format.
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
#'
#' @examples
#' \dontrun{
#' df_csv <- read.socrataRBIND(url = "http://soda.demo.socrata.com/resource/4334-bgaj.csv")
#' df_manual2 <- read.socrataRBIND(domain = "http://data.cityofchicago.org/", fourByFour = "ydr8-5enu")
#' df_manual3 <- read.socrataRBIND(domain = "http://data.cityofchicago.org/", fourByFour = "ydr8-5enu",
#' output = "csv")
#' }
#'
#' @importFrom httr parse_url build_url
#' @importFrom mime guess_type
#' @importFrom plyr rbind.fill
#'
#' @export
read.socrataRBIND <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL, fourByFour = NULL,
                              query = NULL, offset = 0, output = "csv") {
  if (is.null(url)) {
    if (is.null(domain) || is.null(fourByFour)) {
      stop("Either `url` or both `domain` and `fourByFour` must be supplied.", call. = FALSE)
    }
    # make sure exactly one "/" separates the domain from the resource path
    buildUrl <- paste0(sub("/*$", "/", domain), "resource/", fourByFour, ".", output)
    url <- httr::parse_url(buildUrl)
  }
  # paste0() would render e.g. 1e5 as "1e+05"; always send plain digits
  limitText <- format(limit, scientific = FALSE, trim = TRUE)
  # check url syntax, allow human-readable Socrata url
  validUrl <- validateUrl(url, app_token)
  parsedUrl <- httr::parse_url(paste0(validUrl, "&$limit=", limitText))
  mimeType <- mime::guess_type(cleanAmp(parsedUrl$path))
  if (!(mimeType %in% c("text/csv", "application/json", "text/plain"))) {
    stop(mimeType, " not a supported data format. Try JSON or CSV. For GeoJSON use: read.socrataGEO")
  }
  response <- errorHandling(validUrl)
  results <- getContentAsDataFrame(response)
  dataTypes <- getSodaTypes(response)
  rowCount <- as.numeric(getMetadata(cleanQuest(validUrl))[1])
  # loop invariant: how the paging parameters attach to the URL
  separator <- if (is.null(parsedUrl$query)) "?" else "&"
  ## More to come? Loop over pages implicitly
  # NOTE: binding page by page is kept on purpose; this file benchmarks row binding
  while (nrow(results) < rowCount) {
    query_url <- paste0(
      validUrl, separator,
      "$offset=", format(nrow(results), scientific = FALSE, trim = TRUE),
      "&$limit=", limitText
    )
    response <- errorHandling(query_url)
    page <- getContentAsDataFrame(response)
    # an empty page means the server has nothing more for this request;
    # without this guard the loop would never terminate
    if (is.null(page) || nrow(page) == 0) {
      break
    }
    results <- rbind(results, page) # accumulate data
  }
  # Convert Socrata calendar dates to POSIX format:
  # pick the columns whose data type is known (not NA) and equals "calendar_date"
  if (!is.null(dataTypes)) {
    columnTypes <- dataTypes[fieldName(colnames(results))]
    isCalendarDate <- !is.na(columnTypes) & columnTypes == "calendar_date"
    for (columnName in colnames(results)[isCalendarDate]) {
      results[[columnName]] <- posixify(results[[columnName]])
    }
  }
  results
}
#' @title Get a full Socrata data set as an R data frame (plyr variant)
#'
#' @description Same download and paging logic as the other \code{read.socrata*}
#' variants in this file, but pages are appended with \code{plyr::rbind.fill}.
#' \code{query} and \code{offset} are accepted but currently not used.
#'
#' @inheritParams read.socrataRBIND
#' @return a data frame with \code{calendar_date} columns converted to POSIX date-times.
#'
#' @importFrom httr parse_url build_url
#' @importFrom mime guess_type
#' @importFrom plyr rbind.fill
#'
#' @export
read.socrataPLYR <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL, fourByFour = NULL,
                             query = NULL, offset = 0, output = "csv") {
  if (is.null(url)) {
    if (is.null(domain) || is.null(fourByFour)) {
      stop("Either `url` or both `domain` and `fourByFour` must be supplied.", call. = FALSE)
    }
    # make sure exactly one "/" separates the domain from the resource path
    buildUrl <- paste0(sub("/*$", "/", domain), "resource/", fourByFour, ".", output)
    url <- httr::parse_url(buildUrl)
  }
  # paste0() would render e.g. 1e5 as "1e+05"; always send plain digits
  limitText <- format(limit, scientific = FALSE, trim = TRUE)
  # check url syntax, allow human-readable Socrata url
  validUrl <- validateUrl(url, app_token)
  parsedUrl <- httr::parse_url(paste0(validUrl, "&$limit=", limitText))
  mimeType <- mime::guess_type(cleanAmp(parsedUrl$path))
  if (!(mimeType %in% c("text/csv", "application/json", "text/plain"))) {
    stop(mimeType, " not a supported data format. Try JSON or CSV. For GeoJSON use: read.socrataGEO")
  }
  response <- errorHandling(validUrl)
  results <- getContentAsDataFrame(response)
  dataTypes <- getSodaTypes(response)
  rowCount <- as.numeric(getMetadata(cleanQuest(validUrl))[1])
  # loop invariant: how the paging parameters attach to the URL
  separator <- if (is.null(parsedUrl$query)) "?" else "&"
  ## More to come? Loop over pages implicitly
  # NOTE: binding page by page is kept on purpose; this file benchmarks row binding
  while (nrow(results) < rowCount) {
    query_url <- paste0(
      validUrl, separator,
      "$offset=", format(nrow(results), scientific = FALSE, trim = TRUE),
      "&$limit=", limitText
    )
    response <- errorHandling(query_url)
    page <- getContentAsDataFrame(response)
    # an empty page means the server has nothing more for this request;
    # without this guard the loop would never terminate
    if (is.null(page) || nrow(page) == 0) {
      break
    }
    results <- plyr::rbind.fill(results, page) # accumulate data
  }
  # Convert Socrata calendar dates to POSIX format:
  # pick the columns whose data type is known (not NA) and equals "calendar_date"
  if (!is.null(dataTypes)) {
    columnTypes <- dataTypes[fieldName(colnames(results))]
    isCalendarDate <- !is.na(columnTypes) & columnTypes == "calendar_date"
    for (columnName in colnames(results)[isCalendarDate]) {
      results[[columnName]] <- posixify(results[[columnName]])
    }
  }
  results
}
#' @title Get a full Socrata data set as an R data frame (dplyr variant)
#'
#' @description Same download and paging logic as the other \code{read.socrata*}
#' variants in this file, but pages are appended with \code{dplyr::bind_rows}.
#' \code{query} and \code{offset} are accepted but currently not used.
#'
#' @inheritParams read.socrataRBIND
#' @return a data frame with \code{calendar_date} columns converted to POSIX date-times.
#'
#' @importFrom httr parse_url build_url
#' @importFrom mime guess_type
#' @importFrom dplyr bind_rows
#'
#' @export
read.socrataDPLYR <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL, fourByFour = NULL,
                              query = NULL, offset = 0, output = "csv") {
  if (is.null(url)) {
    if (is.null(domain) || is.null(fourByFour)) {
      stop("Either `url` or both `domain` and `fourByFour` must be supplied.", call. = FALSE)
    }
    # make sure exactly one "/" separates the domain from the resource path
    buildUrl <- paste0(sub("/*$", "/", domain), "resource/", fourByFour, ".", output)
    url <- httr::parse_url(buildUrl)
  }
  # paste0() would render e.g. 1e5 as "1e+05"; always send plain digits
  limitText <- format(limit, scientific = FALSE, trim = TRUE)
  # check url syntax, allow human-readable Socrata url
  validUrl <- validateUrl(url, app_token)
  parsedUrl <- httr::parse_url(paste0(validUrl, "&$limit=", limitText))
  mimeType <- mime::guess_type(cleanAmp(parsedUrl$path))
  if (!(mimeType %in% c("text/csv", "application/json", "text/plain"))) {
    stop(mimeType, " not a supported data format. Try JSON or CSV. For GeoJSON use: read.socrataGEO")
  }
  response <- errorHandling(validUrl)
  results <- getContentAsDataFrame(response)
  dataTypes <- getSodaTypes(response)
  rowCount <- as.numeric(getMetadata(cleanQuest(validUrl))[1])
  # loop invariant: how the paging parameters attach to the URL
  separator <- if (is.null(parsedUrl$query)) "?" else "&"
  ## More to come? Loop over pages implicitly
  # NOTE: binding page by page is kept on purpose; this file benchmarks row binding
  while (nrow(results) < rowCount) {
    query_url <- paste0(
      validUrl, separator,
      "$offset=", format(nrow(results), scientific = FALSE, trim = TRUE),
      "&$limit=", limitText
    )
    response <- errorHandling(query_url)
    page <- getContentAsDataFrame(response)
    # an empty page means the server has nothing more for this request;
    # without this guard the loop would never terminate
    if (is.null(page) || nrow(page) == 0) {
      break
    }
    results <- dplyr::bind_rows(results, page) # accumulate data
  }
  # Convert Socrata calendar dates to POSIX format:
  # pick the columns whose data type is known (not NA) and equals "calendar_date"
  if (!is.null(dataTypes)) {
    columnTypes <- dataTypes[fieldName(colnames(results))]
    isCalendarDate <- !is.na(columnTypes) & columnTypes == "calendar_date"
    for (columnName in colnames(results)[isCalendarDate]) {
      results[[columnName]] <- posixify(results[[columnName]])
    }
  }
  results
}
#' @title Get a full Socrata data set (data.table variant)
#'
#' @description Same download and paging logic as the other \code{read.socrata*}
#' variants in this file, but pages are appended with
#' \code{data.table::rbindlist(..., fill = TRUE)}.
#' \code{query} and \code{offset} are accepted but currently not used.
#'
#' @inheritParams read.socrataRBIND
#' @return the downloaded rows with \code{calendar_date} columns converted to POSIX
#'   date-times; the first page is returned as parsed when no further page is needed,
#'   otherwise the result of \code{data.table::rbindlist}.
#'
#' @importFrom httr parse_url build_url
#' @importFrom mime guess_type
#' @importFrom data.table rbindlist
#'
#' @export
read.socrataDATATABLE <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL, fourByFour = NULL,
                                  query = NULL, offset = 0, output = "csv") {
  if (is.null(url)) {
    if (is.null(domain) || is.null(fourByFour)) {
      stop("Either `url` or both `domain` and `fourByFour` must be supplied.", call. = FALSE)
    }
    # make sure exactly one "/" separates the domain from the resource path
    buildUrl <- paste0(sub("/*$", "/", domain), "resource/", fourByFour, ".", output)
    url <- httr::parse_url(buildUrl)
  }
  # paste0() would render e.g. 1e5 as "1e+05"; always send plain digits
  limitText <- format(limit, scientific = FALSE, trim = TRUE)
  # check url syntax, allow human-readable Socrata url
  validUrl <- validateUrl(url, app_token)
  parsedUrl <- httr::parse_url(paste0(validUrl, "&$limit=", limitText))
  mimeType <- mime::guess_type(cleanAmp(parsedUrl$path))
  if (!(mimeType %in% c("text/csv", "application/json", "text/plain"))) {
    stop(mimeType, " not a supported data format. Try JSON or CSV. For GeoJSON use: read.socrataGEO")
  }
  response <- errorHandling(validUrl)
  results <- getContentAsDataFrame(response)
  dataTypes <- getSodaTypes(response)
  rowCount <- as.numeric(getMetadata(cleanQuest(validUrl))[1])
  # loop invariant: how the paging parameters attach to the URL
  separator <- if (is.null(parsedUrl$query)) "?" else "&"
  ## More to come? Loop over pages implicitly
  # NOTE: binding page by page is kept on purpose; this file benchmarks row binding
  while (nrow(results) < rowCount) {
    query_url <- paste0(
      validUrl, separator,
      "$offset=", format(nrow(results), scientific = FALSE, trim = TRUE),
      "&$limit=", limitText
    )
    response <- errorHandling(query_url)
    page <- getContentAsDataFrame(response)
    # an empty page means the server has nothing more for this request;
    # without this guard the loop would never terminate
    if (is.null(page) || nrow(page) == 0) {
      break
    }
    results <- data.table::rbindlist(list(results, page), fill = TRUE) # accumulate data
  }
  # Convert Socrata calendar dates to POSIX format:
  # pick the columns whose data type is known (not NA) and equals "calendar_date"
  if (!is.null(dataTypes)) {
    columnTypes <- dataTypes[fieldName(colnames(results))]
    isCalendarDate <- !is.na(columnTypes) & columnTypes == "calendar_date"
    for (columnName in colnames(results)[isCalendarDate]) {
      results[[columnName]] <- posixify(results[[columnName]])
    }
  }
  results
}
#' Download GeoJSON data using geojsonio package
#'
#' The URL's path must be recognised as GeoJSON (i.e. carry a .geojson suffix);
#' otherwise the function stops with an error instead of trying to download.
#'
#' @param what - \link{geojsonio} What to return format is chosen. One of list or \code{sp}
#' (the default of this function).
#' @param parse - \link{geojsonio} Parse geojson to data.frame like structures if possible or not. Default: FALSE (~not)
#' @param method - \link{geojsonio} One of "web" or "local" (default). Matches on partial strings.
#' @param ... - other arguments from \link{geojsonio} package for geojson_read method
#' @param url - A Socrata resource URL, or a Socrata "human-friendly" URL,
#' requesting a .geojson suffix.
#'
#' @importFrom geojsonio geojson_read
#' @importFrom httr build_url parse_url
#' @importFrom mime guess_type
#'
#' @return Returns a \code{sp} object, which is the default option here.
#'
#' @examples
#' \dontrun{
#' df_geo <- read.socrataGEO(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson")
#' }
#'
#' @export
read.socrataGEO <- function(url = "", method = "local", what = "sp", parse = FALSE, ...) {
  if (!is.character(url) || length(url) != 1L || is.na(url) || !nzchar(url)) {
    stop("`url` must be a single non-empty character string.", call. = FALSE)
  }
  validUrl <- httr::parse_url(url)
  mimeType <- if (is.null(validUrl$path)) {
    NA_character_
  } else {
    mime::guess_type(validUrl$path)
  }
  # accept both the legacy vendor type and the registered GeoJSON media type
  geoTypes <- c("application/vnd.geo+json", "application/geo+json")
  if (!isTRUE(mimeType %in% geoTypes)) {
    # previously this fell through to an "object 'results' not found" error
    stop("URL does not point to a GeoJSON resource (.geojson suffix expected): ", url,
         call. = FALSE)
  }
  geojsonio::geojson_read(url, method = method, parse = parse, what = what, ...)
}
#' Get the SoDA 2 data types
#'
#' Get the Socrata Open Data Application Program Interface data types from the http response header.
#' Used only for CSV and JSON, not GeoJSON
#'
#' Reads the \code{x-soda2-types} and \code{x-soda2-fields} headers, both of
#' which are parsed as JSON. Both headers must be present; if either one is
#' missing there is nothing to map and \code{NULL} is returned.
#'
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
#' @param response - an httr response object (its \code{headers} element is read)
#' @return a named vector mapping field names to data types, or \code{NULL}
#'   when the type/field headers are not both available
#' @importFrom jsonlite fromJSON
#' @noRd
getSodaTypes <- function(response) {
  types <- response$headers[["x-soda2-types"]]
  fields <- response$headers[["x-soda2-fields"]]
  # both headers are required: the types are the values, the fields their names
  if (is.null(types) || is.null(fields)) {
    return(NULL)
  }
  result <- jsonlite::fromJSON(types)
  names(result) <- jsonlite::fromJSON(fields)
  result
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment