Skip to content

Instantly share code, notes, and snippets.

@nacnudus
Last active March 22, 2023 05:01
Show Gist options
  • Save nacnudus/55e601ed7d466b6a22bb36002f23aa64 to your computer and use it in GitHub Desktop.
Save nacnudus/55e601ed7d466b6a22bb36002f23aa64 to your computer and use it in GitHub Desktop.
Convert xml to a nested data frame
``` r
library(xml2)
library(dplyr)
library(purrr)
library(stringr)
# From the root node:
# If has_children, then recurse.
# Otherwise, attributes, value and children (nested) to data frame.
xml_to_df <- function(doc, ns = xml_ns(doc)) {
node_to_df <- function(node) {
# Filter the attributes for ones that aren't namespaces
# x <- list(.index = 0, .name = xml_name(node, ns))
x <- list(.name = xml_name(node, ns))
# Attributes as column headers, and their values in the first row
attrs <- xml_attrs(node)
if (length(attrs) > 0) {attrs <- attrs[!grepl("xmlns", names(attrs))]}
if (length(attrs) > 0) {x <- c(x, attrs)}
# Build data frame manually, to avoid as.data.frame's good intentions
children <- xml_children(node)
if (length(children) >= 1) {
x <-
children %>%
# Recurse here
map(node_to_df) %>%
split_by(".name") %>%
map(bind_rows) %>%
map(list) %>%
{c(x, .)}
attr(x, "row.names") <- 1L
class(x) <- c("tbl_df", "data.frame")
} else {
x$.value <- xml_text(node)
}
x
}
node_to_df(doc)
}
xml <- "
<messages>
<note id=\"501\">
<to>Tove</to>
<from>Jani</from>
<heading>Reminder</heading>
<body>Don't forget me this weekend!</body>
</note>
<note id=\"502\">
<to>Jani</to>
<from>Tove</from>
<heading>Re: Reminder</heading>
<body>I will not</body>
</note>
</messages>"
xml_df <- xml_to_df(read_xml(xml))
xml_df
#> Source: local data frame [1 x 2]
#>
#> .name note
#> (chr) (chr)
#> 1 messages <tbl_df [2,6]>
xml_df$note
#> [[1]]
#> Source: local data frame [2 x 6]
#>
#> .name id body from heading to
#> (chr) (chr) (chr) (chr) (chr) (chr)
#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>
#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>
tidyr::unnest(xml_df$note[[1]], body)
#> Source: local data frame [2 x 7]
#>
#> .name id from heading to .name
#> (chr) (chr) (chr) (chr) (chr) (chr)
#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body
#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body
#> Variables not shown: .value (chr)
x <- read_xml('
<root>
<doc1 xmlns = "http://foo.com"><baz /></doc1>
<doc2 xmlns = "http://bar.com"><baz /></doc2>
</root>
')
xml_to_df(x)
#> Source: local data frame [1 x 3]
#>
#> .name d1:doc1 d2:doc2
#> (chr) (chr) (chr)
#> 1 root <tbl_df [1,2]> <tbl_df [1,2]>
# jennybc's linked code to import xml from Google sheets
library(googlesheets)
library(httr)
pts_ws_feed <- "https://spreadsheets.google.com/feeds/worksheets/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/public/full"
(ss <- gs_ws_feed(pts_ws_feed))
#> Spreadsheet title: test-gs-public-testing-sheet
#> Spreadsheet author: rpackagetest
#> Date of googlesheets registration: 2016-06-03 00:02:56 GMT
#> Date of last spreadsheet update: 2016-03-11 05:37:48 GMT
#> visibility: public
#> permissions: rw
#> version: new
#>
#> Contains 8 worksheets:
#> (Title): (Nominal worksheet extent as rows x columns)
#> embedded_empty_cells: 8 x 7
#> special_chars: 1000 x 26
#> diabolical_column_names: 4 x 8
#> shipwrecks: 1000 x 26
#> for_resizing: 1799 x 30
#> for_updating: 1000 x 26
#> empty: 1000 x 26
#> colnames_only: 1000 x 26
#>
#> Key: 1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8
#> Browser URL: https://docs.google.com/spreadsheets/d/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/
col_names <- TRUE
ws <- "embedded_empty_cells"
index <- match(ws, ss$ws$ws_title)
the_url <- ss$ws$listfeed[index]
req <- GET(the_url)
rc <- read_xml(content(req, as = "text", encoding = "UTF-8"))
ns <- xml_ns_rename(xml_ns(rc), d1 = "feed")
# Parsing jennybc's xml with xml_to_df
x <- xml_to_df(rc, ns)
x$`feed:entry`[[1]] %>%
select(starts_with("gsx:")) %>%
map(bind_rows, .id = "row") %>%
bind_rows
#> Source: local data frame [23 x 3]
#>
#> row .name .value
#> (chr) (chr) (chr)
#> 1 1 gsx:_chk2m Americas
#> 2 3 gsx:_chk2m Americas
#> 3 4 gsx:_chk2m Americas
#> 4 1 gsx:country Argentina
#> 5 2 gsx:country Argentina
#> 6 3 gsx:country
#> 7 4 gsx:country Argentina
#> 8 1 gsx:gdppercap 5911.315053
#> 9 2 gsx:gdppercap 6856.856212
#> 10 3 gsx:gdppercap 7133.166023
#> .. ... ... ...
```
@nacnudus
Copy link
Author

nacnudus commented Jun 3, 2016

@jennybc Your parsing code can now be reduced somewhat using xml_to_df (see gist for full code):

x <- xml_to_df(rc, ns)
x$`feed:entry`[[1]] %>% 
  select(starts_with("gsx:")) %>%
  map(bind_rows, .id = "row") %>% 
  bind_rows
#> Source: local data frame [23 x 3]
#> 
#>      row         .name      .value
#>    (chr)         (chr)       (chr)
#> 1      1    gsx:_chk2m    Americas
#> 2      3    gsx:_chk2m    Americas
#> 3      4    gsx:_chk2m    Americas
#> 4      1   gsx:country   Argentina
#> 5      2   gsx:country   Argentina
#> 6      3   gsx:country            
#> 7      4   gsx:country   Argentina
#> 8      1 gsx:gdppercap 5911.315053
#> 9      2 gsx:gdppercap 6856.856212
#> 10     3 gsx:gdppercap 7133.166023
#> ..   ...           ...         ...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment