GuillaumePressiat · February 20, 2018 20:07
diff --git a/xml_to_df.R b/xml_to_df.R
 ``` r
 library(xml2)
 library(dplyr)
 library(purrr)
 library(stringr)

 # From the root node:
 # If has_children, then recurse.
 # Otherwise, attributes, value and children (nested) to data frame.

 xml_to_df <- function(doc, ns = xml_ns(doc)) {
  node_to_df <- function(node) {
    # Filter the attributes for ones that aren't namespaces
    # x <- list(.index = 0, .name = xml_name(node, ns))
    x <- list(.name = xml_name(node, ns))
    # Attributes as column headers, and their values in the first row
    attrs <- xml_attrs(node)
    if (length(attrs) > 0) {attrs <- attrs[!grepl("xmlns", names(attrs))]}
    if (length(attrs) > 0) {x <- c(x, attrs)}
    # Build data frame manually, to avoid as.data.frame's good intentions
    children <- xml_children(node)
    if (length(children) >= 1) {
      x <- 
        children %>%
        # Recurse here
        map(node_to_df) %>%
        split_by(".name") %>%
        map(bind_rows) %>%
        map(list) %>%
        {c(x, .)}
      attr(x, "row.names") <- 1L
      class(x) <- c("tbl_df", "data.frame")
    } else {
      x$.value <- xml_text(node)
    }
    x
  }
  node_to_df(doc)
 }


 xml <- "
 <messages>
  <note id=\"501\">
    <to>Tove</to>
    <from>Jani</from>
    <heading>Reminder</heading>
    <body>Don't forget me this weekend!</body>
  </note>
  <note id=\"502\">
    <to>Jani</to>
    <from>Tove</from>
    <heading>Re: Reminder</heading>
    <body>I will not</body>
  </note>
 </messages>"

 xml_df <- xml_to_df(read_xml(xml))
 xml_df
 #> Source: local data frame [1 x 2]
 #> 
 #>      .name           note
 #>      (chr)          (chr)
 #> 1 messages <tbl_df [2,6]>

 xml_df$note
 #> [[1]]
 #> Source: local data frame [2 x 6]
 #> 
 #>   .name    id           body           from        heading             to
 #>   (chr) (chr)          (chr)          (chr)          (chr)          (chr)
 #> 1  note   501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>
 #> 2  note   502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>

 tidyr::unnest(xml_df$note[[1]], body)
 #> Source: local data frame [2 x 7]
 #> 
 #>   .name    id           from        heading             to .name
 #>   (chr) (chr)          (chr)          (chr)          (chr) (chr)
 #> 1  note   501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>  body
 #> 2  note   502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>  body
 #> Variables not shown: .value (chr)

 x <- read_xml('
 <root>
   <doc1 xmlns = "http://foo.com"><baz /></doc1>
   <doc2 xmlns = "http://bar.com"><baz /></doc2>
 </root>
 ')
 xml_to_df(x)
 #> Source: local data frame [1 x 3]
 #> 
 #>   .name        d1:doc1        d2:doc2
 #>   (chr)          (chr)          (chr)
 #> 1  root <tbl_df [1,2]> <tbl_df [1,2]>

 # jennybc's linked code to import xml from Google sheets
 library(googlesheets)
 library(httr)
 pts_ws_feed <- "https://spreadsheets.google.com/feeds/worksheets/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/public/full"
 (ss <- gs_ws_feed(pts_ws_feed))
 #>                   Spreadsheet title: test-gs-public-testing-sheet
 #>                  Spreadsheet author: rpackagetest
 #>   Date of googlesheets registration: 2016-06-03 00:02:56 GMT
 #>     Date of last spreadsheet update: 2016-03-11 05:37:48 GMT
 #>                          visibility: public
 #>                         permissions: rw
 #>                             version: new
 #> 
 #> Contains 8 worksheets:
 #> (Title): (Nominal worksheet extent as rows x columns)
 #> embedded_empty_cells: 8 x 7
 #> special_chars: 1000 x 26
 #> diabolical_column_names: 4 x 8
 #> shipwrecks: 1000 x 26
 #> for_resizing: 1799 x 30
 #> for_updating: 1000 x 26
 #> empty: 1000 x 26
 #> colnames_only: 1000 x 26
 #> 
 #> Key: 1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8
 #> Browser URL: https://docs.google.com/spreadsheets/d/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/
 col_names <- TRUE
 ws <- "embedded_empty_cells"
 index <- match(ws, ss$ws$ws_title)
 the_url <- ss$ws$listfeed[index]
 req <- GET(the_url)
 rc <- read_xml(content(req, as = "text", encoding = "UTF-8"))
 ns <- xml_ns_rename(xml_ns(rc), d1 = "feed")

 # Parsing jennybc's xml with xml_to_df
 x <- xml_to_df(rc, ns)
 x$`feed:entry`[[1]] %>% 
  select(starts_with("gsx:")) %>%
  map(bind_rows, .id = "row") %>% 
  bind_rows
 #> Source: local data frame [23 x 3]
 #> 
 #>      row         .name      .value
 #>    (chr)         (chr)       (chr)
 #> 1      1    gsx:_chk2m    Americas
 #> 2      3    gsx:_chk2m    Americas
 #> 3      4    gsx:_chk2m    Americas
 #> 4      1   gsx:country   Argentina
 #> 5      2   gsx:country   Argentina
 #> 6      3   gsx:country            
 #> 7      4   gsx:country   Argentina
 #> 8      1 gsx:gdppercap 5911.315053
 #> 9      2 gsx:gdppercap 6856.856212
 #> 10     3 gsx:gdppercap 7133.166023
 #> ..   ...           ...         ...
 ```
	``` r
	library(xml2)
	library(dplyr)
	library(purrr)
	library(stringr)

	# From the root node:
	# If has_children, then recurse.
	# Otherwise, attributes, value and children (nested) to data frame.

	xml_to_df <- function(doc, ns = xml_ns(doc)) {
	node_to_df <- function(node) {
	# Filter the attributes for ones that aren't namespaces
	# x <- list(.index = 0, .name = xml_name(node, ns))
	x <- list(.name = xml_name(node, ns))
	# Attributes as column headers, and their values in the first row
	attrs <- xml_attrs(node)
	if (length(attrs) > 0) {attrs <- attrs[!grepl("xmlns", names(attrs))]}
	if (length(attrs) > 0) {x <- c(x, attrs)}
	# Build data frame manually, to avoid as.data.frame's good intentions
	children <- xml_children(node)
	if (length(children) >= 1) {
	x <-
	children %>%
	# Recurse here
	map(node_to_df) %>%
	split_by(".name") %>%
	map(bind_rows) %>%
	map(list) %>%
	{c(x, .)}
	attr(x, "row.names") <- 1L
	class(x) <- c("tbl_df", "data.frame")
	} else {
	x$.value <- xml_text(node)
	}
	x
	}
	node_to_df(doc)
	}


	xml <- "
	<messages>
	<note id=\"501\">
	<to>Tove</to>
	<from>Jani</from>
	<heading>Reminder</heading>
	<body>Don't forget me this weekend!</body>
	</note>
	<note id=\"502\">
	<to>Jani</to>
	<from>Tove</from>
	<heading>Re: Reminder</heading>
	<body>I will not</body>
	</note>
	</messages>"

	xml_df <- xml_to_df(read_xml(xml))
	xml_df
	#> Source: local data frame [1 x 2]
	#>
	#> .name note
	#> (chr) (chr)
	#> 1 messages <tbl_df [2,6]>

	xml_df$note
	#> [[1]]
	#> Source: local data frame [2 x 6]
	#>
	#> .name id body from heading to
	#> (chr) (chr) (chr) (chr) (chr) (chr)
	#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>
	#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]>

	tidyr::unnest(xml_df$note[[1]], body)
	#> Source: local data frame [2 x 7]
	#>
	#> .name id from heading to .name
	#> (chr) (chr) (chr) (chr) (chr) (chr)
	#> 1 note 501 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body
	#> 2 note 502 <tbl_df [1,2]> <tbl_df [1,2]> <tbl_df [1,2]> body
	#> Variables not shown: .value (chr)

	x <- read_xml('
	<root>
	<doc1 xmlns = "http://foo.com"><baz /></doc1>
	<doc2 xmlns = "http://bar.com"><baz /></doc2>
	</root>
	')
	xml_to_df(x)
	#> Source: local data frame [1 x 3]
	#>
	#> .name d1:doc1 d2:doc2
	#> (chr) (chr) (chr)
	#> 1 root <tbl_df [1,2]> <tbl_df [1,2]>

	# jennybc's linked code to import xml from Google sheets
	library(googlesheets)
	library(httr)
	pts_ws_feed <- "https://spreadsheets.google.com/feeds/worksheets/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/public/full"
	(ss <- gs_ws_feed(pts_ws_feed))
	#> Spreadsheet title: test-gs-public-testing-sheet
	#> Spreadsheet author: rpackagetest
	#> Date of googlesheets registration: 2016-06-03 00:02:56 GMT
	#> Date of last spreadsheet update: 2016-03-11 05:37:48 GMT
	#> visibility: public
	#> permissions: rw
	#> version: new
	#>
	#> Contains 8 worksheets:
	#> (Title): (Nominal worksheet extent as rows x columns)
	#> embedded_empty_cells: 8 x 7
	#> special_chars: 1000 x 26
	#> diabolical_column_names: 4 x 8
	#> shipwrecks: 1000 x 26
	#> for_resizing: 1799 x 30
	#> for_updating: 1000 x 26
	#> empty: 1000 x 26
	#> colnames_only: 1000 x 26
	#>
	#> Key: 1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8
	#> Browser URL: https://docs.google.com/spreadsheets/d/1amnxLg9VVDoE6KSIZvutYkEGNgQyJSnLJgHthehruy8/
	col_names <- TRUE
	ws <- "embedded_empty_cells"
	index <- match(ws, ss$ws$ws_title)
	the_url <- ss$ws$listfeed[index]
	req <- GET(the_url)
	rc <- read_xml(content(req, as = "text", encoding = "UTF-8"))
	ns <- xml_ns_rename(xml_ns(rc), d1 = "feed")

	# Parsing jennybc's xml with xml_to_df
	x <- xml_to_df(rc, ns)
	x$`feed:entry`[[1]] %>%
	select(starts_with("gsx:")) %>%
	map(bind_rows, .id = "row") %>%
	bind_rows
	#> Source: local data frame [23 x 3]
	#>
	#> row .name .value
	#> (chr) (chr) (chr)
	#> 1 1 gsx:_chk2m Americas
	#> 2 3 gsx:_chk2m Americas
	#> 3 4 gsx:_chk2m Americas
	#> 4 1 gsx:country Argentina
	#> 5 2 gsx:country Argentina
	#> 6 3 gsx:country
	#> 7 4 gsx:country Argentina
	#> 8 1 gsx:gdppercap 5911.315053
	#> 9 2 gsx:gdppercap 6856.856212
	#> 10 3 gsx:gdppercap 7133.166023
	#> .. ... ... ...
	```