Last active
November 10, 2016 18:22
-
-
Save ateucher/490dc31dded8a763c0f7574f192cc28c to your computer and use it in GitHub Desktop.
Parse BC Govt Directory to a list, and search for a name
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(xml2) | |
bc_dir <- read_xml("http://dir.gov.bc.ca/downloads/BCGOV_directory.xml") | |
find_person <- function(nm) { | |
search_string <- sprintf("//d1:PERSON/d1:NAME[contains(text(), \"%s\")]", nm) | |
matches <- xml_find_all(bc_dir, search_string) | |
if (length(matches) == 0) { | |
message("No matches found") | |
return(invisible(NULL)) | |
} | |
hierach <- lapply(matches, function(x) rev(xml_parents(x)[-1])[-1]) | |
orgs <- lapply(hierach, function(x) { | |
org <- xml_attr(x, "NAME") | |
names(org) <- c("Organization", paste("Org_Unit", seq_len(length(org) - 1), sep = "_")) | |
org | |
}) | |
people <- lapply(xml_parent(matches), function(x) unlist(extract_person(x))) | |
if (length(people) == 1) { | |
person <- unlist(c(orgs, people)) | |
} else { | |
choices <- lapply(seq_along(orgs), function(i) { | |
org <- orgs[[i]] | |
person <- people[[i]][c("title", "name")] | |
paste(c(org, person), collapse = "\n") | |
}) | |
which <- menu(choices, title = "More than one match found. Please choose one") | |
person <- c(orgs[[which]], people[[which]]) | |
} | |
person | |
} | |
extract_person <- function(person) { | |
title <- xml_text(xml_find_all(person, "d1:TITLE")) | |
name <- xml_text(xml_find_all(person, "d1:NAME")) | |
contact <- extract_contact(person) | |
list(title = title, name = name, contact = contact) | |
} | |
extract_contact <- function(unit) { | |
contacts_xml <- xml_find_all(unit, "d1:CONTACT") | |
if (length(contacts_xml) == 0) return(NULL) | |
contacts <- as.list(xml_text(contacts_xml)) | |
setNames(contacts, unlist(xml_attr(contacts_xml, "TYPE"))) | |
} | |
find_person("Teucher") | |
find_person("Hazlitt") | |
### Useful extra stuff -------------------------------------------------------- | |
get_orgs <- function(dir) { | |
xml_find_all(dir, "d1:ORGANIZATION") | |
} | |
make_org_unit <- function(org) { | |
contacts <- extract_contact(org) | |
addresses <- extract_address(org) | |
people <- get_people(org) | |
org_units_xml <- get_org_units(org) | |
if (!is.null(org_units_xml)) { | |
org_units <- setNames(lapply(org_units_xml, make_org_unit), | |
unlist(xml_attr(org_units_xml, "NAME"))) | |
} else { | |
org_units <- NULL | |
} | |
filter_null(list(contacts = contacts, addresses = addresses, people = people, | |
org_units = org_units)) | |
} | |
get_org_units <- function(x) { | |
org_units <- xml_find_all(x, "d1:ORGUNIT") | |
if (length(org_units) == 0) return(NULL) | |
org_units | |
} | |
get_people <- function(org_unit) { | |
people <- xml_find_all(org_unit, "d1:PERSON") | |
if (length(people) == 0) return(NULL) | |
lapply(people, extract_person) | |
} | |
extract_address <- function(unit) { | |
addresses_xml <- xml_find_all(unit, "d1:ADDRESS") | |
if (length(addresses_xml) == 0) return(NULL) | |
addresses <- as.list(xml_text(addresses_xml)) | |
setNames(addresses, unlist(xml_attr(addresses_xml, "TYPE"))) | |
} | |
filter_null <- function(x) Filter(Negate(is.null), x) | |
## Useless Extra Stuff --------------------------------------------------------- | |
# xml_name(bc_dir, ns) | |
# | |
# xml_children(bc_dir) | |
# | |
# html_structure(bc_dir) | |
# | |
# xml_contents(bc_dir) | |
# | |
# | |
# ## Get list of organizations | |
# orgs <- xml_find_all(bc_dir, "d1:ORGANIZATION") | |
# orgs_list <- xml_attr(orgs, "NAME") | |
# | |
# ## List of org units - this can be deeply recursive | |
# org <- xml_find_all(bc_dir, "d1:ORGANIZATION[@NAME='Ministry of Environment']") | |
# org_list <- list(name = xml_attr(org, "NAME")) | |
# | |
# org_unit_names <- xml_attr(org_units, "NAME") | |
# org_list$org_units = setNames(lapply(seq_along(org_unit_names), function(x) { | |
# xml_children(org_units[[x]]) | |
# }), org_unit_names) | |
# | |
# xml_children(org_units[[3]]) | |
# | |
# xml_children(test) | |
# xml_find_all(xml_children(xml_children(test)), "//d1:ORGANIZATION//PERSON") | |
# | |
# | |
# | |
# | |
# | |
# name <- xml_text(xml_find_all(bc_dir, "//d1:NAME")) | |
# title <- xml_text(xml_find_all(bc_dir, "//d1:TITLE")) | |
# email <- xml_find_all(bc_dir, "//d1:CONTACT[@TYPE='email']", xml_ns(bc_dir)) | |
# | |
# | |
# foo <- strsplit(names, ",") | |
# lens <- unlist(lapply(foo, length)) | |
# | |
# foo[lens > 3] | |
# | |
# xml_text(people) | |
# | |
# bc_list <- as_list(bc_dir) | |
# | |
# | |
# x <- read_xml("<foo><bar><baz/></bar><baz/></foo>") | |
# xml_find_all(x, ".//baz") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment