# returnData.R -- used for benchmarking RSocrata row binding
# An interface to data hosted online in Socrata data repositories.
# This is the main file, which uses other functions to download data from Socrata repositories.
#
# Author: Hugh J. Devlin, Ph.D., et al. 2013-08-28
###############################################################################
# library("httr") # for access to the HTTP header
# library("jsonlite") # for parsing data types from Socrata
# library("mime") # for guessing mime type
# library("geojsonio") # for geospatial json
# library("plyr") # for a faster binding of rows
#' Content parsers
#'
#' Return a data frame for CSV or JSON content
#'
#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
#' @importFrom httr content
#' @importFrom geojsonio geojson_read
#' @param response - an httr response object
#' @return data frame, possibly empty
#' @noRd
getContentAsDataFrame <- function(response) {
  mimeType <- response$headers[['content-type']]
  # Strip optional parameters such as "; charset=utf-8"
  sep <- regexpr(';', mimeType)[1]
  if (sep != -1) {
    mimeType <- substr(mimeType, 1, sep - 1)
  }
  switch(mimeType,
    "text/csv" =
      httr::content(response), # automatic parsing
    "application/json" =
      if (httr::content(response, as = "text") == "[ ]") { # empty json?
        data.frame() # empty data frame
      } else {
        # flatten the parsed list of records row-wise into a character data frame
        data.frame(t(sapply(httr::content(response), unlist)), stringsAsFactors = FALSE)
      }
  )
}
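# Illustrative sketch (hypothetical parsed JSON, not part of the package): the
# "application/json" branch above flattens a parsed list of records row-wise,
# yielding an all-character data frame:
#   parsed <- list(list(id = "1", name = "a"), list(id = "2", name = "b"))
#   data.frame(t(sapply(parsed, unlist)), stringsAsFactors = FALSE)
#   #   id name
#   # 1  1    a
#   # 2  2    b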
#' Get a full Socrata data set as an R data frame
#'
#' @description Manages throttling and POSIX date-time conversions. We advise using the .csv
#' suffix rather than .json; both are supported, but the .json path has known bugs (see the
#' Issue section below).
#'
#' @param url - A Socrata resource URL, a Socrata "human-friendly" URL, or a Socrata Open
#' Data Application Program Interface (SODA) query requesting a comma-separated download
#' format (.csv suffix). May include SoQL parameters; the SODA \code{limit} and \code{offset}
#' parameters are now handled via the \code{limit} and \code{offset} arguments below.
#' Either pass a complete URL, or use the parameters below to construct one.
#' @param app_token - an optional string; a SODA API token used to query the data
#' portal \url{http://dev.socrata.com/consumers/getting-started.html}
#' @param query - A query based on the "Socrata Query Language" ("SoQL"); see
#' \url{http://dev.socrata.com/docs/queries.html}.
#' @param limit - defaults to the maximum of 50000 rows per request. See
#' \url{http://dev.socrata.com/docs/paging.html}.
#' @param offset - defaults to 0. See \url{http://dev.socrata.com/docs/paging.html}.
#' @param output - defaults to "csv"; one of \code{"csv"} or \code{"json"}.
#' @param domain - A Socrata domain, e.g. \url{http://data.cityofchicago.org}
#' @param fourByFour - a unique 4x4 identifier, e.g. "ydr8-5enu"; see \code{\link{isFourByFour}}.
#'
#' @section TODO: \url{https://github.com/Chicago/RSocrata/issues/14}
#' @section Issue: If you get an error like \code{Error in rbind(deparse.level, ...) :
#' numbers of columns of arguments do not match} when using "json" output, this is a known bug
#' (\url{https://github.com/Chicago/RSocrata/issues/19}); use "csv" output instead for the
#' time being.
#'
#' @return a data frame, with Socrata calendar dates converted to POSIX date-times.
#' @author Hugh J. Devlin, Ph.D. \email{Hugh.Devlin@@cityofchicago.org}
#'
#' @examples
#' \dontrun{
#' df_csv <- read.socrataRBIND(url = "http://soda.demo.socrata.com/resource/4334-bgaj.csv")
#' df_manual2 <- read.socrataRBIND(domain = "http://data.cityofchicago.org/", fourByFour = "ydr8-5enu")
#' df_manual3 <- read.socrataRBIND(domain = "http://data.cityofchicago.org/", fourByFour = "ydr8-5enu",
#'                                 output = "csv")
#' }
#'
#' @importFrom httr parse_url build_url
#' @importFrom mime guess_type
#'
#' @export
read.socrataRBIND <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL,
                              fourByFour = NULL, query = NULL, offset = 0, output = "csv") {
  if (is.null(url)) {
    buildUrl <- paste0(domain, "resource/", fourByFour, ".", output)
    url <- httr::parse_url(buildUrl)
  }
  # Check URL syntax; allow a human-readable Socrata URL
  validUrl <- validateUrl(url, app_token)
  parsedUrl <- httr::parse_url(paste0(validUrl, "&$limit=", limit))
  mimeType <- mime::guess_type(cleanAmp(parsedUrl$path))
  if (!(mimeType %in% c("text/csv", "application/json", "text/plain"))) {
    stop(mimeType, " is not a supported data format. Try JSON or CSV. For GeoJSON use read.socrataGEO.")
  }
  response <- errorHandling(validUrl)
  results <- getContentAsDataFrame(response)
  dataTypes <- getSodaTypes(response)
  rowCount <- as.numeric(getMetadata(cleanQuest(validUrl))[1])
  # More to come? Page through the resource, offsetting by the rows already fetched
  while (nrow(results) < rowCount) {
    query_url <- paste0(validUrl, ifelse(is.null(parsedUrl$query), "?", "&"),
                        "$offset=", nrow(results), "&$limit=", limit)
    response <- errorHandling(query_url)
    page <- getContentAsDataFrame(response)
    results <- rbind(results, page) # accumulate pages with base rbind
  }
  # Convert Socrata calendar dates to POSIX format: for every column whose SODA
  # type is "calendar_date" (and not NA), parse it with posixify()
  if (!is.null(dataTypes)) {
    for (columnName in colnames(results)[!is.na(dataTypes[fieldName(colnames(results))]) &
                                         dataTypes[fieldName(colnames(results))] == "calendar_date"]) {
      results[[columnName]] <- posixify(results[[columnName]])
    }
  }
  return(results)
}
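# A minimal sketch (hypothetical data, not from the package) of the failure mode noted
# in the Issue section above: base rbind() cannot bind pages whose column counts differ,
# which can happen with "json" output when a page omits empty fields:
#   a <- data.frame(x = 1)
#   b <- data.frame(x = 2, y = 3)
#   rbind(a, b)  # Error in rbind(deparse.level, ...) : numbers of columns of arguments do not match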
#' @title PLYR: bind pages with \code{plyr::rbind.fill}
#' @importFrom httr parse_url build_url
#' @importFrom mime guess_type
#' @importFrom plyr rbind.fill
#'
#' @export
read.socrataPLYR <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL,
                             fourByFour = NULL, query = NULL, offset = 0, output = "csv") {
  if (is.null(url)) {
    buildUrl <- paste0(domain, "resource/", fourByFour, ".", output)
    url <- httr::parse_url(buildUrl)
  }
  # Check URL syntax; allow a human-readable Socrata URL
  validUrl <- validateUrl(url, app_token)
  parsedUrl <- httr::parse_url(paste0(validUrl, "&$limit=", limit))
  mimeType <- mime::guess_type(cleanAmp(parsedUrl$path))
  if (!(mimeType %in% c("text/csv", "application/json", "text/plain"))) {
    stop(mimeType, " is not a supported data format. Try JSON or CSV. For GeoJSON use read.socrataGEO.")
  }
  response <- errorHandling(validUrl)
  results <- getContentAsDataFrame(response)
  dataTypes <- getSodaTypes(response)
  rowCount <- as.numeric(getMetadata(cleanQuest(validUrl))[1])
  # More to come? Page through the resource, offsetting by the rows already fetched
  while (nrow(results) < rowCount) {
    query_url <- paste0(validUrl, ifelse(is.null(parsedUrl$query), "?", "&"),
                        "$offset=", nrow(results), "&$limit=", limit)
    response <- errorHandling(query_url)
    page <- getContentAsDataFrame(response)
    results <- plyr::rbind.fill(results, page) # accumulate pages; NA-pads missing columns
  }
  # Convert Socrata calendar dates to POSIX format: for every column whose SODA
  # type is "calendar_date" (and not NA), parse it with posixify()
  if (!is.null(dataTypes)) {
    for (columnName in colnames(results)[!is.na(dataTypes[fieldName(colnames(results))]) &
                                         dataTypes[fieldName(colnames(results))] == "calendar_date"]) {
      results[[columnName]] <- posixify(results[[columnName]])
    }
  }
  return(results)
}
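# Sketch (hypothetical data): unlike base rbind(), plyr::rbind.fill() pads columns
# missing from either argument with NA, so pages with differing columns still bind:
#   plyr::rbind.fill(data.frame(x = 1), data.frame(x = 2, y = 3))
#   #   x  y
#   # 1 1 NA
#   # 2 2  3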
#' @title DPLYR: bind pages with \code{dplyr::bind_rows}
#' @importFrom httr parse_url build_url
#' @importFrom mime guess_type
#' @importFrom dplyr bind_rows
#'
#' @export
read.socrataDPLYR <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL,
                              fourByFour = NULL, query = NULL, offset = 0, output = "csv") {
  if (is.null(url)) {
    buildUrl <- paste0(domain, "resource/", fourByFour, ".", output)
    url <- httr::parse_url(buildUrl)
  }
  # Check URL syntax; allow a human-readable Socrata URL
  validUrl <- validateUrl(url, app_token)
  parsedUrl <- httr::parse_url(paste0(validUrl, "&$limit=", limit))
  mimeType <- mime::guess_type(cleanAmp(parsedUrl$path))
  if (!(mimeType %in% c("text/csv", "application/json", "text/plain"))) {
    stop(mimeType, " is not a supported data format. Try JSON or CSV. For GeoJSON use read.socrataGEO.")
  }
  response <- errorHandling(validUrl)
  results <- getContentAsDataFrame(response)
  dataTypes <- getSodaTypes(response)
  rowCount <- as.numeric(getMetadata(cleanQuest(validUrl))[1])
  # More to come? Page through the resource, offsetting by the rows already fetched
  while (nrow(results) < rowCount) {
    query_url <- paste0(validUrl, ifelse(is.null(parsedUrl$query), "?", "&"),
                        "$offset=", nrow(results), "&$limit=", limit)
    response <- errorHandling(query_url)
    page <- getContentAsDataFrame(response)
    results <- dplyr::bind_rows(results, page) # accumulate pages; NA-pads missing columns
  }
  # Convert Socrata calendar dates to POSIX format: for every column whose SODA
  # type is "calendar_date" (and not NA), parse it with posixify()
  if (!is.null(dataTypes)) {
    for (columnName in colnames(results)[!is.na(dataTypes[fieldName(colnames(results))]) &
                                         dataTypes[fieldName(colnames(results))] == "calendar_date"]) {
      results[[columnName]] <- posixify(results[[columnName]])
    }
  }
  return(results)
}
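# Sketch (hypothetical data): dplyr::bind_rows() behaves like rbind.fill here,
# NA-padding missing columns, with a compiled implementation under the hood:
#   dplyr::bind_rows(data.frame(x = 1), data.frame(x = 2, y = 3))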
#' @title DATATABLE: bind pages with \code{data.table::rbindlist}
#' @importFrom httr parse_url build_url
#' @importFrom mime guess_type
#' @importFrom data.table rbindlist
#'
#' @export
read.socrataDATATABLE <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL,
                                  fourByFour = NULL, query = NULL, offset = 0, output = "csv") {
  if (is.null(url)) {
    buildUrl <- paste0(domain, "resource/", fourByFour, ".", output)
    url <- httr::parse_url(buildUrl)
  }
  # Check URL syntax; allow a human-readable Socrata URL
  validUrl <- validateUrl(url, app_token)
  parsedUrl <- httr::parse_url(paste0(validUrl, "&$limit=", limit))
  mimeType <- mime::guess_type(cleanAmp(parsedUrl$path))
  if (!(mimeType %in% c("text/csv", "application/json", "text/plain"))) {
    stop(mimeType, " is not a supported data format. Try JSON or CSV. For GeoJSON use read.socrataGEO.")
  }
  response <- errorHandling(validUrl)
  results <- getContentAsDataFrame(response)
  dataTypes <- getSodaTypes(response)
  rowCount <- as.numeric(getMetadata(cleanQuest(validUrl))[1])
  # More to come? Page through the resource, offsetting by the rows already fetched
  while (nrow(results) < rowCount) {
    query_url <- paste0(validUrl, ifelse(is.null(parsedUrl$query), "?", "&"),
                        "$offset=", nrow(results), "&$limit=", limit)
    response <- errorHandling(query_url)
    page <- getContentAsDataFrame(response)
    # accumulate pages; fill = TRUE NA-fills missing columns and the
    # result is a data.table (a data.frame subclass)
    results <- data.table::rbindlist(list(results, page), fill = TRUE)
  }
  # Convert Socrata calendar dates to POSIX format: for every column whose SODA
  # type is "calendar_date" (and not NA), parse it with posixify()
  if (!is.null(dataTypes)) {
    for (columnName in colnames(results)[!is.na(dataTypes[fieldName(colnames(results))]) &
                                         dataTypes[fieldName(colnames(results))] == "calendar_date"]) {
      results[[columnName]] <- posixify(results[[columnName]])
    }
  }
  return(results)
}
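# Benchmarking sketch, matching this file's purpose (assumes the microbenchmark
# package is installed; the URL is the demo resource from the examples above).
# The four variants differ only in how downloaded pages are bound together:
#   u <- "http://soda.demo.socrata.com/resource/4334-bgaj.csv"
#   microbenchmark::microbenchmark(
#     base  = read.socrataRBIND(url = u),
#     plyr  = read.socrataPLYR(url = u),
#     dplyr = read.socrataDPLYR(url = u),
#     dt    = read.socrataDATATABLE(url = u),
#     times = 5
#   )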
#' Download GeoJSON data using geojsonio package
#'
#' @param what - passed to \link{geojsonio}; the format to return, one of "list" or
#' \code{sp} (the default here).
#' @param parse - passed to \link{geojsonio}; whether to parse GeoJSON into data.frame-like
#' structures where possible. Default: \code{FALSE}.
#' @param method - passed to \link{geojsonio}; one of "web" or "local" (default). Matches
#' on partial strings.
#' @param ... - further arguments passed to \code{geojson_read} in the \link{geojsonio} package
#' @param url - A Socrata resource URL, or a Socrata "human-friendly" URL, with a
#' .geojson suffix.
#'
#' @importFrom geojsonio geojson_read
#' @importFrom httr build_url parse_url
#' @importFrom mime guess_type
#'
#' @return Returns an \code{sp} object, which is the default \code{what} option here.
#'
#' @examples
#' \dontrun{
#' df_geo <- read.socrataGEO(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson")
#' }
#'
#' @export
read.socrataGEO <- function(url = "", method = "local", what = "sp", parse = FALSE, ...) {
  validUrl <- httr::parse_url(url)
  mimeType <- mime::guess_type(validUrl$path)
  # fail early on non-GeoJSON input instead of returning an undefined object
  if (mimeType != "application/vnd.geo+json") {
    stop(mimeType, " is not a supported format here. Use a URL with a .geojson suffix.")
  }
  geojsonio::geojson_read(url, method = method, parse = parse, what = what, ...)
}
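# Usage sketch (assumes the sp package is installed): with what = "sp" the result
# is an sp Spatial* object, so the usual sp methods apply:
#   library(sp)
#   geo <- read.socrataGEO(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson")
#   summary(geo) # bounding box, projection, attribute summary
#   plot(geo)    # dispatches to sp's plot method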
#' Get the SoDA 2 data types
#'
#' Get the Socrata Open Data Application Program Interface data types from the HTTP
#' response headers. Used only for CSV and JSON, not GeoJSON.
#'
#' @author Hugh J. Devlin, Ph.D. \email{Hugh.Devlin@@cityofchicago.org}
#' @param response - an httr response object
#' @return a named vector mapping field names to data types
#' @importFrom jsonlite fromJSON
#' @noRd
getSodaTypes <- function(response) {
  # both headers are required: the types array supplies the values, the fields
  # array supplies the names (&& also short-circuits if the first is missing)
  if (!is.null(response$headers[['x-soda2-types']]) && !is.null(response$headers[['x-soda2-fields']])) {
    result <- jsonlite::fromJSON(response$headers[['x-soda2-types']])
    names(result) <- jsonlite::fromJSON(response$headers[['x-soda2-fields']])
    return(result)
  } else {
    return(NULL)
  }
}
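# Illustrative sketch (hypothetical header values): Socrata sends field names and
# types as parallel JSON arrays in the response headers, e.g.
#   x-soda2-fields: ["id","datetime","magnitude"]
#   x-soda2-types:  ["number","calendar_date","number"]
# getSodaTypes() turns these into the named vector used for date conversion above:
#   c(id = "number", datetime = "calendar_date", magnitude = "number")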