Last active
March 22, 2023 05:01
-
-
Save nacnudus/55e601ed7d466b6a22bb36002f23aa64 to your computer and use it in GitHub Desktop.
Convert xml to a nested data frame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
``` r | |
library(xml2) | |
library(dplyr) | |
library(purrr) | |
library(stringr) | |
# From the root node: | |
# If has_children, then recurse. | |
# Otherwise, attributes, value and children (nested) to data frame. | |
xml_to_df <- function(doc, ns = xml_ns(doc)) { | |
node_to_df <- function(node) { | |
# Filter the attributes for ones that aren't namespaces | |
# x <- list(.index = 0, .name = xml_name(node, ns)) | |
x <- list(.name = xml_name(node, ns)) | |
# Attributes as column headers, and their values in the first row | |
attrs <- xml_attrs(node) | |
if (length(attrs) > 0) {attrs <- attrs[!grepl("xmlns", names(attrs))]} | |
if (length(attrs) > 0) {x <- c(x, attrs)} | |
# Build data frame manually, to avoid as.data.frame's good intentions | |
children <- xml_children(node) | |
if (length(children) >= 1) { | |
x <- | |
children %>% | |
# Recurse here | |
map(node_to_df) %>% | |
split_by(".name") %>% | |
map(bind_rows) %>% | |
map(list) %>% | |
{c(x, .)} | |
attr(x, "row.names") <- 1L | |
class(x) <- c("tbl_df", "data.frame") | |
} else { | |
x$.value <- xml_text(node) | |
} | |
x | |
} | |
node_to_df(doc) | |
} | |
xml <- " | |
<messages> | |
<note id=\"501\"> | |
<to>Tove</to> | |
<from>Jani</from> | |
<heading>Reminder</heading> | |
<body>Don't forget me this weekend!</body> | |
</note> | |
<note id=\"502\"> | |
<to>Jani</to> | |
<from>Tove</from> | |
<heading>Re: Reminder</heading> | |
<body>I will not</body> | |
</note> | |
</messages>" | |
xml_df <- xml_to_df(read_xml(xml)) | |
xml_df | |
#> Source: local data frame [1 x 2] | |
#> | |
#> .name note | |
#> (chr) (chr) | |
#> 1 messages <tbl_df [2,6]> | |
xml_df$note | |
#> [[1]] | |
#> Source: local data frame [2 x 6] | |
#> | |
#> .name id body from heading to | |
#> (chr) (chr) (chr) (chr) (chr) (chr) | |
#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> | |
#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> | |
tidyr::unnest(xml_df$note[[1]], body) | |
#> Source: local data frame [2 x 7] | |
#> | |
#> .name id from heading to .name | |
#> (chr) (chr) (chr) (chr) (chr) (chr) | |
#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body | |
#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body | |
#> Variables not shown: .value (chr) | |
x <- read_xml(' | |
<root> | |
<doc1 xmlns = "http://foo.com"><baz /></doc1> | |
<doc2 xmlns = "http://bar.com"><baz /></doc2> | |
</root> | |
') | |
xml_to_df(x) | |
#> Source: local data frame [1 x 3] | |
#> | |
#> .name d1:doc1 d2:doc2 | |
#> (chr) (chr) (chr) | |
#> 1 root <tbl_df [1,2]> <tbl_df [1,2]> | |
# jennybc's linked code to import xml from Google sheets | |
library(googlesheets) | |
library(httr) | |
pts_ws_feed <- "https://spreadsheets.google.com/feeds/worksheets/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/public/full" | |
(ss <- gs_ws_feed(pts_ws_feed)) | |
#> Spreadsheet title: test-gs-public-testing-sheet | |
#> Spreadsheet author: rpackagetest | |
#> Date of googlesheets registration: 2016-06-03 00:02:56 GMT | |
#> Date of last spreadsheet update: 2016-03-11 05:37:48 GMT | |
#> visibility: public | |
#> permissions: rw | |
#> version: new | |
#> | |
#> Contains 8 worksheets: | |
#> (Title): (Nominal worksheet extent as rows x columns) | |
#> embedded_empty_cells: 8 x 7 | |
#> special_chars: 1000 x 26 | |
#> diabolical_column_names: 4 x 8 | |
#> shipwrecks: 1000 x 26 | |
#> for_resizing: 1799 x 30 | |
#> for_updating: 1000 x 26 | |
#> empty: 1000 x 26 | |
#> colnames_only: 1000 x 26 | |
#> | |
#> Key: 1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8 | |
#> Browser URL: https://docs.google.com/spreadsheets/d/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/ | |
col_names <- TRUE | |
ws <- "embedded_empty_cells" | |
index <- match(ws, ss$ws$ws_title) | |
the_url <- ss$ws$listfeed[index] | |
req <- GET(the_url) | |
rc <- read_xml(content(req, as = "text", encoding = "UTF-8")) | |
ns <- xml_ns_rename(xml_ns(rc), d1 = "feed") | |
# Parsing jennybc's xml with xml_to_df | |
x <- xml_to_df(rc, ns) | |
x$`feed:entry`[[1]] %>% | |
select(starts_with("gsx:")) %>% | |
map(bind_rows, .id = "row") %>% | |
bind_rows | |
#> Source: local data frame [23 x 3] | |
#> | |
#> row .name .value | |
#> (chr) (chr) (chr) | |
#> 1 1 gsx:_chk2m Americas | |
#> 2 3 gsx:_chk2m Americas | |
#> 3 4 gsx:_chk2m Americas | |
#> 4 1 gsx:country Argentina | |
#> 5 2 gsx:country Argentina | |
#> 6 3 gsx:country | |
#> 7 4 gsx:country Argentina | |
#> 8 1 gsx:gdppercap 5911.315053 | |
#> 9 2 gsx:gdppercap 6856.856212 | |
#> 10 3 gsx:gdppercap 7133.166023 | |
#> .. ... ... ... | |
``` |
@jennybc Your parsing code can now be reduced somewhat using xml_to_df
(see gist for full code):
x <- xml_to_df(rc, ns)
x$`feed:entry`[[1]] %>%
select(starts_with("gsx:")) %>%
map(bind_rows, .id = "row") %>%
bind_rows
#> Source: local data frame [23 x 3]
#>
#> row .name .value
#> (chr) (chr) (chr)
#> 1 1 gsx:_chk2m Americas
#> 2 3 gsx:_chk2m Americas
#> 3 4 gsx:_chk2m Americas
#> 4 1 gsx:country Argentina
#> 5 2 gsx:country Argentina
#> 6 3 gsx:country
#> 7 4 gsx:country Argentina
#> 8 1 gsx:gdppercap 5911.315053
#> 9 2 gsx:gdppercap 6856.856212
#> 10 3 gsx:gdppercap 7133.166023
#> .. ... ... ...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for your comments!
library(xml2)
added.You make good point about data at the top level vs deeper. An earlier draft added the data at every level, rather than only leaf nodes:
But the trouble is that
libxml2
, whichxml2
andXML
both depend on, returns all the data in a node, including child nodes, so in this case the root node would have "ToveJaniReminder...I will not". In some cases that's useful (e.g. parsing in-cell-formatted text in xlsx), but more often it's a pain.Another hitch is that
unnest
only copes with columns of identically-structured data frames, so NULLs scupper it.One way to handle namespaces is to use them as part of the names, which
xml2
already does. I've updated the code accordingly.