-
-
Save GuillaumePressiat/27465b3f34a617527baf659c1fcf96f0 to your computer and use it in GitHub Desktop.
Convert xml to a nested data frame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
``` r | |
library(xml2) | |
library(dplyr) | |
library(purrr) | |
library(stringr) | |
# From the root node: | |
# If has_children, then recurse. | |
# Otherwise, attributes, value and children (nested) to data frame. | |
xml_to_df <- function(doc, ns = xml_ns(doc)) { | |
node_to_df <- function(node) { | |
# Filter the attributes for ones that aren't namespaces | |
# x <- list(.index = 0, .name = xml_name(node, ns)) | |
x <- list(.name = xml_name(node, ns)) | |
# Attributes as column headers, and their values in the first row | |
attrs <- xml_attrs(node) | |
if (length(attrs) > 0) {attrs <- attrs[!grepl("xmlns", names(attrs))]} | |
if (length(attrs) > 0) {x <- c(x, attrs)} | |
# Build data frame manually, to avoid as.data.frame's good intentions | |
children <- xml_children(node) | |
if (length(children) >= 1) { | |
x <- | |
children %>% | |
# Recurse here | |
map(node_to_df) %>% | |
split_by(".name") %>% | |
map(bind_rows) %>% | |
map(list) %>% | |
{c(x, .)} | |
attr(x, "row.names") <- 1L | |
class(x) <- c("tbl_df", "data.frame") | |
} else { | |
x$.value <- xml_text(node) | |
} | |
x | |
} | |
node_to_df(doc) | |
} | |
xml <- " | |
<messages> | |
<note id=\"501\"> | |
<to>Tove</to> | |
<from>Jani</from> | |
<heading>Reminder</heading> | |
<body>Don't forget me this weekend!</body> | |
</note> | |
<note id=\"502\"> | |
<to>Jani</to> | |
<from>Tove</from> | |
<heading>Re: Reminder</heading> | |
<body>I will not</body> | |
</note> | |
</messages>" | |
xml_df <- xml_to_df(read_xml(xml)) | |
xml_df | |
#> Source: local data frame [1 x 2] | |
#> | |
#> .name note | |
#> (chr) (chr) | |
#> 1 messages <tbl_df [2,6]> | |
xml_df$note | |
#> [[1]] | |
#> Source: local data frame [2 x 6] | |
#> | |
#> .name id body from heading to | |
#> (chr) (chr) (chr) (chr) (chr) (chr) | |
#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> | |
#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> | |
tidyr::unnest(xml_df$note[[1]], body) | |
#> Source: local data frame [2 x 7] | |
#> | |
#> .name id from heading to .name | |
#> (chr) (chr) (chr) (chr) (chr) (chr) | |
#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body | |
#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body | |
#> Variables not shown: .value (chr) | |
x <- read_xml(' | |
<root> | |
<doc1 xmlns = "http://foo.com"><baz /></doc1> | |
<doc2 xmlns = "http://bar.com"><baz /></doc2> | |
</root> | |
') | |
xml_to_df(x) | |
#> Source: local data frame [1 x 3] | |
#> | |
#> .name d1:doc1 d2:doc2 | |
#> (chr) (chr) (chr) | |
#> 1 root <tbl_df [1,2]> <tbl_df [1,2]> | |
# jennybc's linked code to import xml from Google sheets | |
library(googlesheets) | |
library(httr) | |
pts_ws_feed <- "https://spreadsheets.google.com/feeds/worksheets/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/public/full" | |
(ss <- gs_ws_feed(pts_ws_feed)) | |
#> Spreadsheet title: test-gs-public-testing-sheet | |
#> Spreadsheet author: rpackagetest | |
#> Date of googlesheets registration: 2016-06-03 00:02:56 GMT | |
#> Date of last spreadsheet update: 2016-03-11 05:37:48 GMT | |
#> visibility: public | |
#> permissions: rw | |
#> version: new | |
#> | |
#> Contains 8 worksheets: | |
#> (Title): (Nominal worksheet extent as rows x columns) | |
#> embedded_empty_cells: 8 x 7 | |
#> special_chars: 1000 x 26 | |
#> diabolical_column_names: 4 x 8 | |
#> shipwrecks: 1000 x 26 | |
#> for_resizing: 1799 x 30 | |
#> for_updating: 1000 x 26 | |
#> empty: 1000 x 26 | |
#> colnames_only: 1000 x 26 | |
#> | |
#> Key: 1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8 | |
#> Browser URL: https://docs.google.com/spreadsheets/d/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/ | |
col_names <- TRUE | |
ws <- "embedded_empty_cells" | |
index <- match(ws, ss$ws$ws_title) | |
the_url <- ss$ws$listfeed[index] | |
req <- GET(the_url) | |
rc <- read_xml(content(req, as = "text", encoding = "UTF-8")) | |
ns <- xml_ns_rename(xml_ns(rc), d1 = "feed") | |
# Parsing jennybc's xml with xml_to_df | |
x <- xml_to_df(rc, ns) | |
x$`feed:entry`[[1]] %>% | |
select(starts_with("gsx:")) %>% | |
map(bind_rows, .id = "row") %>% | |
bind_rows | |
#> Source: local data frame [23 x 3] | |
#> | |
#> row .name .value | |
#> (chr) (chr) (chr) | |
#> 1 1 gsx:_chk2m Americas | |
#> 2 3 gsx:_chk2m Americas | |
#> 3 4 gsx:_chk2m Americas | |
#> 4 1 gsx:country Argentina | |
#> 5 2 gsx:country Argentina | |
#> 6 3 gsx:country | |
#> 7 4 gsx:country Argentina | |
#> 8 1 gsx:gdppercap 5911.315053 | |
#> 9 2 gsx:gdppercap 6856.856212 | |
#> 10 3 gsx:gdppercap 7133.166023 | |
#> .. ... ... ... | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment