Last active
May 18, 2017 02:55
-
-
Save dantonnoriega/43392de4f1a4e7e378621ef026fd9096 to your computer and use it in GitHub Desktop.
extract data from durham county jail records
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(httr) | |
library(tidyverse) | |
library(xml2) | |
library(rvest) | |
devtools::install_github('dantonnoriega/xmltools') | |
# SET UP VARIABLES | |
# record_type: | |
# "0" (all incarcerated) | |
# "1" (last 24 hours) | |
# "30" (last 30 days) DEFAULT | |
record_type <- "0" | |
OUTPUT_DIR <- "~/Downloads" | |
# OUTPUT VARIABLES | |
url <- 'http://www2.durhamcountync.gov/sheriff/ips/default.aspx' | |
type <- switch(record_type, "0" = "all-incarcerated", "1" = "last-24-hours", "last-30-days") | |
FILENAME <- sprintf("durham-county-records-%s.csv", type) | |
# FUNCTIONS ------------------------------------------------ | |
# set up html session | |
durham_POST <- function(url, record_type = "30") { | |
session <- rvest::html_session(url) | |
form <- rvest::html_form(session)[[1]] # get form values for session | |
# str(form$fields) # look at 'options' for the "select" class | |
form <- rvest::set_values(form, ddlDateListing = record_type, ddlNameIndex = "All") | |
body <- rvest::pluck(form$fields, 'value') # convert to named list with pluck | |
POST(session$url, body = body, verbose()) # sending form data request | |
} | |
durham_convert <- function(dat, grp) { | |
v <- unique(grp) # get group values | |
d <- lapply(v, function(x) { | |
i <- which(grp %in% x) | |
name <- dat[i][[1]][[1]] %>% setNames(., 'name') | |
cols <- dat[i][2] %>% | |
unlist() %>% | |
tolower() %>% | |
gsub('\\W', '_', .) # replace punct/space _ | |
tbl <- do.call(rbind, dat[i][-c(1,2)]) %>% setNames(., cols) | |
as.tibble(cbind(tbl, name)) | |
}) | |
d <- dplyr::bind_rows(d) | |
# clean up data. order matters. | |
d %>% | |
mutate(days_in_jail_charge = gsub('<1', '.5', days_in_jail_charge)) %>% | |
mutate(bond_amount = gsub('\\W', '', bond_amount)) %>% | |
mutate(incarcerated = date_released == "[incarcerated]") %>% | |
mutate(date_released = gsub('\\[incarcerated\\]', NA_character_, date_released)) %>% | |
mutate_at(vars(starts_with("date_")), funs(as.record_type), format = '%m/%d/%Y') %>% | |
mutate_if(is.character, gsub, pattern = '\\[N/A\\]', replacement = NA_character_) %>% | |
mutate_at(vars(bond_amount, days_in_jail_charge), as.numeric) | |
} | |
# GET DATA ------------------------------------------------ | |
r <- durham_POST(url, record_type) | |
# extract data | |
xml <- content(r, "parsed") %>% | |
xml_find_all(".//form//table[@id='Table1']") | |
# the table has no nesting. sucks. | |
dat <- xmltools::xml_dig_df(xml)[[1]] | |
nms <- lapply(dat, names) | |
indx <- sapply(nms, is.null) # when null, its a name | |
grp <- cumsum(indx) # gives us an grouping | |
tbl <- durham_convert(dat, grp) | |
write_csv(tbl, file.path(OUTPUT_DIR, FILENAME)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment