Skip to content

Instantly share code, notes, and snippets.

@patperu
Last active April 3, 2017 14:54
Show Gist options
  • Save patperu/0ba473a4df164499465fd356b9bfd7cc to your computer and use it in GitHub Desktop.
Save patperu/0ba473a4df164499465fd356b9bfd7cc to your computer and use it in GitHub Desktop.
library(xml2)
library(dplyr)
library(purrr)
library(tibble)
# Download the sample XML export to a temporary file, parse it, and delete the
# file straight away; `doc` is what the rest of the script works with.
temp <- tempfile(fileext = ".xml")
download.file(
  url = "https://www.dropbox.com/s/en2697k57o79gw8/VIS_DATASET_AMBIENTE_09_1999.xml?raw=1",
  destfile = temp
)
doc <- read_xml(x = temp)
unlink(x = temp)
# http://stackoverflow.com/a/40801799
# Time the record-by-record approach with a single run: each DATA_RECORD is
# turned into a one-row tibble (child element names become column names) and
# the rows are then bound together. The result is also kept in `t1`.
mbt1 <- microbenchmark::microbenchmark(
  t1 <- xml_find_all(doc, ".//DATA_RECORD") %>%
    map(xml_children) %>%
    map(~as_tibble(t(set_names(xml_text(.), xml_name(.))))) %>%
    bind_rows(),
  times = 1L
)
mbt1
#Unit: seconds
# expr min lq mean median uq max neval
# ... 66.23821 66.23821 66.23821 66.23821 66.23821 66.23821 1
t1
#> t1
## A tibble: 51,310 × 10
# STATUS_STADIO VERIFICA STATUS MEDIA ORA DATA UDM PARAMETRO STAZIONE ID
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 1 DATO VALIDO 0 1,401000023 1 01-SET-99 ppm CH4 Belgio 1503375
#2 1 DATO VALIDO 0 1,422000051 2 01-SET-99 ppm CH4 Belgio 1503376
#3 1 DATO VALIDO 0 1,444000006 3 01-SET-99 ppm CH4 Belgio 1503377
# Time the flat approach with a single run: take the text of every grandchild
# of the document node in one pass and reshape it row-wise into a 10-column
# character matrix before converting to a tibble. The result is kept in `t2`.
# NOTE(review): the column count is hard-coded to 10 and the element names are
# not carried over, so the columns get default names.
mbt2 <- microbenchmark::microbenchmark(
  {
    t2 <- xml_children(xml_children(doc)) %>% xml_text()
    t2 <- as_tibble(matrix(t2, ncol = 10, byrow = TRUE))
  },
  times = 1L
)
mbt2
#Unit: seconds
# expr min lq mean median uq max neval
# ... 34.26765 34.26765 34.26765 34.26765 34.26765 34.26765 1
t2
#> t2
## A tibble: 51,310 × 10
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 1 DATO VALIDO 0 1,401000023 1 01-SET-99 ppm CH4 Belgio 1503375
#2 1 DATO VALIDO 0 1,422000051 2 01-SET-99 ppm CH4 Belgio 1503376
#3 1 DATO VALIDO 0 1,444000006 3 01-SET-99 ppm CH4 Belgio 1503377
# Parsing a single XML file this way takes about as long as converting all of the XML files to CSV with `XML2CSVGenericConverter_V1.0.0`...
@jimhester
Copy link

library(tidyverse)
#> Loading tidyverse: ggplot2
#> Loading tidyverse: tibble
#> Loading tidyverse: tidyr
#> Loading tidyverse: readr
#> Loading tidyverse: purrr
#> Loading tidyverse: dplyr
#> Conflicts with tidy packages ----------------------------------------------
#> filter(): dplyr, stats
#> lag():    dplyr, stats
library(xml2)

# Download the sample XML export to a temporary file and parse it into `doc`.
temp <- tempfile(fileext = ".xml")
download.file("https://www.dropbox.com/s/en2697k57o79gw8/VIS_DATASET_AMBIENTE_09_1999.xml?raw=1", temp)
doc <- read_xml(temp)
# The parsed document no longer needs the file on disk, so remove it instead
# of leaving it behind in the session's temp directory.
unlink(temp)

# Record-by-record conversion.
#
# Builds a one-row tibble for every DATA_RECORD node (columns named after the
# record's child elements, values taken from their text) and binds the rows.
#
# doc: a parsed XML document/node to search.
# Returns a tibble with one row per DATA_RECORD.
f1 <- function(doc) {
  records <- xml_find_all(doc, ".//DATA_RECORD")
  fields_per_record <- map(records, xml_children)

  # Named character vector -> 1 x k matrix -> one-row tibble.
  as_row <- function(fields) {
    values <- set_names(xml_text(fields), xml_name(fields))
    as_tibble(t(values))
  }

  bind_rows(map(fields_per_record, as_row))
}

# Flat conversion via grandchildren of the document node.
#
# Reads the text of every grandchild of `doc` in one pass and reshapes the
# resulting character vector row-wise into a matrix with one column per field.
#
# doc: a parsed XML document whose grandchildren are the record fields.
# Returns a tibble of character columns named after the fields of the first
# DATA_RECORD.
f2 <- function(doc) {
  # Field names are taken from the children of the first DATA_RECORD.
  col_names <- xml_find_all(doc, "//DATA_RECORD[1]/*") %>% xml_name()
  cells <- xml_children(xml_children(doc)) %>% xml_text()
  # Use the number of fields found in the document rather than a fixed 10, so
  # the matrix width always agrees with the names assigned below.
  m <- matrix(cells, ncol = length(col_names), byrow = TRUE)
  colnames(m) <- col_names
  as_tibble(m)
}

# Flat conversion via text nodes.
#
# Selects the text nodes of all record fields with a single XPath query and
# reshapes them row-wise into a matrix with one column per field.
#
# doc: a parsed XML document/node to search.
# Returns a tibble of character columns named after the fields of the first
# DATA_RECORD.
f3 <- function(doc) {
  header <- xml_name(xml_find_all(doc, "//DATA_RECORD[1]/*"))
  # NOTE(review): `text()` yields no node for an empty element, so a record
  # with a blank field would shift the following cells -- confirm the data
  # never contains empty fields.
  cells <- xml_text(xml_find_all(doc, "//DATA_RECORD/*/text()"))
  grid <- matrix(cells, ncol = length(header), byrow = TRUE)
  colnames(grid) <- header
  as_tibble(grid)
}

# Column-by-column conversion.
#
# Runs one XPath query per field position, collecting the text nodes of the
# i-th child of every DATA_RECORD as column i.
#
# doc: a parsed XML document/node to search.
# Returns a tibble of character columns named after the fields of the first
# DATA_RECORD.
f4 <- function(doc) {
  col_names <- xml_find_all(doc, "//DATA_RECORD[1]/*") %>% xml_name()
  # Preallocate one slot per column. The previous
  # `as.vector(length(names), "list")` produced a length-1 list holding the
  # count itself, which only worked because the loop overwrote and extended it.
  res <- vector("list", length(col_names))
  for (i in seq_along(col_names)) {
    # NOTE(review): `text()` skips empty elements, so a blank field would make
    # this column shorter than the others -- confirm fields are never empty.
    res[[i]] <- xml_find_all(doc, paste0("//DATA_RECORD/*[", i, "]/text()")) %>% xml_text()
  }
  names(res) <- col_names
  as_tibble(res)
}

# Run each of the four converters once on the same document, keeping their
# results in t1..t4 for the comparisons that follow, then show the timings.
bench <- microbenchmark::microbenchmark(
  bind = {
    t1 <- f1(doc)
  },
  children = {
    t2 <- f2(doc)
  },
  text_only = {
    t3 <- f3(doc)
  },
  direct = {
    t4 <- f4(doc)
  },
  times = 1L
)
bench
#> Unit: seconds
#>       expr      min       lq     mean   median       uq      max neval
#>       bind 36.77058 36.77058 36.77058 36.77058 36.77058 36.77058     1
#>   children 29.81601 29.81601 29.81601 29.81601 29.81601 29.81601     1
#>  text_only 15.10593 15.10593 15.10593 15.10593 15.10593 15.10593     1
#>     direct 13.59607 13.59607 13.59607 13.59607 13.59607 13.59607     1

# Compare the result of the first converter (t1) against each of the other
# three (t2, t3, t4); the pasted output lines show what each call printed.
all.equal(t1, t2)
#> [1] TRUE
all.equal(t1, t3)
#> [1] TRUE
all.equal(t1, t4)
#> [1] TRUE

t1
#> # A tibble: 51,310 × 10
#>    STATUS_STADIO    VERIFICA STATUS       MEDIA   ORA      DATA   UDM
#>            <chr>       <chr>  <chr>       <chr> <chr>     <chr> <chr>
#> 1              1 DATO VALIDO      0 1,401000023     1 01-SET-99   ppm
#> 2              1 DATO VALIDO      0 1,422000051     2 01-SET-99   ppm
#> 3              1 DATO VALIDO      0 1,444000006     3 01-SET-99   ppm
#> 4              1 DATO VALIDO      0 1,422999978     4 01-SET-99   ppm
#> 5              1 DATO VALIDO      0  1,43900001     5 01-SET-99   ppm
#> 6              1 DATO VALIDO      0 1,432999969     6 01-SET-99   ppm
#> 7              1 DATO VALIDO      0 1,440000057     7 01-SET-99   ppm
#> 8              1 DATO VALIDO      0 1,447000027     8 01-SET-99   ppm
#> 9              1 DATO VALIDO      0 1,432000041     9 01-SET-99   ppm
#> 10             1 DATO VALIDO      0  1,47300005    10 01-SET-99   ppm
#> # ... with 51,300 more rows, and 3 more variables: PARAMETRO <chr>,
#> #   STAZIONE <chr>, ID <chr>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment