Skip to content

Instantly share code, notes, and snippets.

@ateucher
Last active November 10, 2016 18:22
Show Gist options
  • Save ateucher/490dc31dded8a763c0f7574f192cc28c to your computer and use it in GitHub Desktop.
Save ateucher/490dc31dded8a763c0f7574f192cc28c to your computer and use it in GitHub Desktop.
Parse BC Govt Directory to a list, and search for a name
library(xml2)
bc_dir <- read_xml("http://dir.gov.bc.ca/downloads/BCGOV_directory.xml")
find_person <- function(nm) {
search_string <- sprintf("//d1:PERSON/d1:NAME[contains(text(), \"%s\")]", nm)
matches <- xml_find_all(bc_dir, search_string)
if (length(matches) == 0) {
message("No matches found")
return(invisible(NULL))
}
hierach <- lapply(matches, function(x) rev(xml_parents(x)[-1])[-1])
orgs <- lapply(hierach, function(x) {
org <- xml_attr(x, "NAME")
names(org) <- c("Organization", paste("Org_Unit", seq_len(length(org) - 1), sep = "_"))
org
})
people <- lapply(xml_parent(matches), function(x) unlist(extract_person(x)))
if (length(people) == 1) {
person <- unlist(c(orgs, people))
} else {
choices <- lapply(seq_along(orgs), function(i) {
org <- orgs[[i]]
person <- people[[i]][c("title", "name")]
paste(c(org, person), collapse = "\n")
})
which <- menu(choices, title = "More than one match found. Please choose one")
person <- c(orgs[[which]], people[[which]])
}
person
}
extract_person <- function(person) {
title <- xml_text(xml_find_all(person, "d1:TITLE"))
name <- xml_text(xml_find_all(person, "d1:NAME"))
contact <- extract_contact(person)
list(title = title, name = name, contact = contact)
}
extract_contact <- function(unit) {
contacts_xml <- xml_find_all(unit, "d1:CONTACT")
if (length(contacts_xml) == 0) return(NULL)
contacts <- as.list(xml_text(contacts_xml))
setNames(contacts, unlist(xml_attr(contacts_xml, "TYPE")))
}
find_person("Teucher")
find_person("Hazlitt")
### Useful extra stuff --------------------------------------------------------
get_orgs <- function(dir) {
xml_find_all(dir, "d1:ORGANIZATION")
}
make_org_unit <- function(org) {
contacts <- extract_contact(org)
addresses <- extract_address(org)
people <- get_people(org)
org_units_xml <- get_org_units(org)
if (!is.null(org_units_xml)) {
org_units <- setNames(lapply(org_units_xml, make_org_unit),
unlist(xml_attr(org_units_xml, "NAME")))
} else {
org_units <- NULL
}
filter_null(list(contacts = contacts, addresses = addresses, people = people,
org_units = org_units))
}
get_org_units <- function(x) {
org_units <- xml_find_all(x, "d1:ORGUNIT")
if (length(org_units) == 0) return(NULL)
org_units
}
get_people <- function(org_unit) {
people <- xml_find_all(org_unit, "d1:PERSON")
if (length(people) == 0) return(NULL)
lapply(people, extract_person)
}
extract_address <- function(unit) {
addresses_xml <- xml_find_all(unit, "d1:ADDRESS")
if (length(addresses_xml) == 0) return(NULL)
addresses <- as.list(xml_text(addresses_xml))
setNames(addresses, unlist(xml_attr(addresses_xml, "TYPE")))
}
filter_null <- function(x) Filter(Negate(is.null), x)
## Useless Extra Stuff ---------------------------------------------------------
# xml_name(bc_dir, ns)
#
# xml_children(bc_dir)
#
# html_structure(bc_dir)
#
# xml_contents(bc_dir)
#
#
# ## Get list of organizations
# orgs <- xml_find_all(bc_dir, "d1:ORGANIZATION")
# orgs_list <- xml_attr(orgs, "NAME")
#
# ## List of org units - this can be deeply recursive
# org <- xml_find_all(bc_dir, "d1:ORGANIZATION[@NAME='Ministry of Environment']")
# org_list <- list(name = xml_attr(org, "NAME"))
#
# org_unit_names <- xml_attr(org_units, "NAME")
# org_list$org_units = setNames(lapply(seq_along(org_unit_names), function(x) {
# xml_children(org_units[[x]])
# }), org_unit_names)
#
# xml_children(org_units[[3]])
#
# xml_children(test)
# xml_find_all(xml_children(xml_children(test)), "//d1:ORGANIZATION//PERSON")
#
#
#
#
#
# name <- xml_text(xml_find_all(bc_dir, "//d1:NAME"))
# title <- xml_text(xml_find_all(bc_dir, "//d1:TITLE"))
# email <- xml_find_all(bc_dir, "//d1:CONTACT[@TYPE='email']", xml_ns(bc_dir))
#
#
# foo <- strsplit(names, ",")
# lens <- unlist(lapply(foo, length))
#
# foo[lens > 3]
#
# xml_text(people)
#
# bc_list <- as_list(bc_dir)
#
#
# x <- read_xml("<foo><bar><baz/></bar><baz/></foo>")
# xml_find_all(x, ".//baz")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment