Skip to content

Instantly share code, notes, and snippets.

@dantonnoriega
Last active May 18, 2017 02:55
Show Gist options
  • Save dantonnoriega/43392de4f1a4e7e378621ef026fd9096 to your computer and use it in GitHub Desktop.
Save dantonnoriega/43392de4f1a4e7e378621ef026fd9096 to your computer and use it in GitHub Desktop.
extract data from durham county jail records
library(httr)
library(tidyverse)
library(xml2)
library(rvest)
devtools::install_github('dantonnoriega/xmltools')
# SET UP VARIABLES
# record_type:
# "0" (all incarcerated)
# "1" (last 24 hours)
# "30" (last 30 days) DEFAULT
record_type <- "0"
OUTPUT_DIR <- "~/Downloads"
# OUTPUT VARIABLES
url <- 'http://www2.durhamcountync.gov/sheriff/ips/default.aspx'
type <- switch(record_type, "0" = "all-incarcerated", "1" = "last-24-hours", "last-30-days")
FILENAME <- sprintf("durham-county-records-%s.csv", type)
# FUNCTIONS ------------------------------------------------
# set up html session
durham_POST <- function(url, record_type = "30") {
session <- rvest::html_session(url)
form <- rvest::html_form(session)[[1]] # get form values for session
# str(form$fields) # look at 'options' for the "select" class
form <- rvest::set_values(form, ddlDateListing = record_type, ddlNameIndex = "All")
body <- rvest::pluck(form$fields, 'value') # convert to named list with pluck
POST(session$url, body = body, verbose()) # sending form data request
}
durham_convert <- function(dat, grp) {
v <- unique(grp) # get group values
d <- lapply(v, function(x) {
i <- which(grp %in% x)
name <- dat[i][[1]][[1]] %>% setNames(., 'name')
cols <- dat[i][2] %>%
unlist() %>%
tolower() %>%
gsub('\\W', '_', .) # replace punct/space _
tbl <- do.call(rbind, dat[i][-c(1,2)]) %>% setNames(., cols)
as.tibble(cbind(tbl, name))
})
d <- dplyr::bind_rows(d)
# clean up data. order matters.
d %>%
mutate(days_in_jail_charge = gsub('<1', '.5', days_in_jail_charge)) %>%
mutate(bond_amount = gsub('\\W', '', bond_amount)) %>%
mutate(incarcerated = date_released == "[incarcerated]") %>%
mutate(date_released = gsub('\\[incarcerated\\]', NA_character_, date_released)) %>%
mutate_at(vars(starts_with("date_")), funs(as.record_type), format = '%m/%d/%Y') %>%
mutate_if(is.character, gsub, pattern = '\\[N/A\\]', replacement = NA_character_) %>%
mutate_at(vars(bond_amount, days_in_jail_charge), as.numeric)
}
# GET DATA ------------------------------------------------
r <- durham_POST(url, record_type)
# extract data
xml <- content(r, "parsed") %>%
xml_find_all(".//form//table[@id='Table1']")
# the table has no nesting. sucks.
dat <- xmltools::xml_dig_df(xml)[[1]]
nms <- lapply(dat, names)
indx <- sapply(nms, is.null) # when null, its a name
grp <- cumsum(indx) # gives us an grouping
tbl <- durham_convert(dat, grp)
write_csv(tbl, file.path(OUTPUT_DIR, FILENAME))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment