Last active
April 3, 2017 14:54
-
-
Save patperu/0ba473a4df164499465fd356b9bfd7cc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(xml2) | |
library(dplyr) | |
library(purrr) | |
library(tibble) | |
# Download the sample XML export to a temporary file and parse it into `doc`.
xml_url <- paste0(
  "https://www.dropbox.com/s/en2697k57o79gw8/",
  "VIS_DATASET_AMBIENTE_09_1999.xml?raw=1"
)
temp <- tempfile(fileext = ".xml")
doc <- tryCatch(
  {
    # mode = "wb" writes the bytes unchanged; the default text mode ("w")
    # rewrites line endings on Windows.
    download.file(xml_url, temp, mode = "wb")
    read_xml(temp)
  },
  # Remove the temporary file whether or not the download/parse succeeded.
  finally = unlink(temp)
)
# Approach 1 (http://stackoverflow.com/a/40801799): build a one-row tibble for
# each <DATA_RECORD> node, then stack the rows with bind_rows().
(mbt1 <- microbenchmark::microbenchmark(
  t1 <- doc %>%
    xml_find_all(".//DATA_RECORD") %>%
    map(xml_children) %>%
    map(function(fields) {
      # Named character vector -> 1 x n matrix -> one-row tibble.
      as_tibble(t(set_names(xml_text(fields), xml_name(fields))))
    }) %>%
    bind_rows(),
  times = 1L
))
#Unit: seconds
# expr min lq mean median uq max neval
# ... 66.23821 66.23821 66.23821 66.23821 66.23821 66.23821 1
t1
#> t1 | |
## A tibble: 51,310 × 10 | |
# STATUS_STADIO VERIFICA STATUS MEDIA ORA DATA UDM PARAMETRO STAZIONE ID | |
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> | |
#1 1 DATO VALIDO 0 1,401000023 1 01-SET-99 ppm CH4 Belgio 1503375 | |
#2 1 DATO VALIDO 0 1,422000051 2 01-SET-99 ppm CH4 Belgio 1503376 | |
#3 1 DATO VALIDO 0 1,444000006 3 01-SET-99 ppm CH4 Belgio 1503377 | |
# Approach 2: read the text of every field node in one vectorised call and
# reshape the flat character vector into one row per record.
(mbt2 <- microbenchmark::microbenchmark(
  {
    records <- xml_children(doc)
    stopifnot(length(records) > 0L)
    # Column names (and therefore the column count) come from the first
    # record instead of a hard-coded `ncol = 10`, so `t2` gets the same
    # header as `t1` rather than the default V1..V10.
    # Assumes every record has the same fields in the same order, as the
    # fixed-width reshape always did.
    col_names <- xml_name(xml_children(records[[1]]))
    values <- xml_text(xml_children(records))
    # Fail loudly instead of letting matrix() recycle a ragged vector.
    stopifnot(length(values) %% length(col_names) == 0L)
    t2 <- as_tibble(matrix(
      values,
      ncol = length(col_names),
      byrow = TRUE,
      dimnames = list(NULL, col_names)
    ))
  },
  times = 1L
))
# The timing below was recorded for the earlier variant that hard-coded
# `ncol = 10` and left the columns unnamed.
#Unit: seconds
# expr min lq mean median uq max neval
# ... 34.26765 34.26765 34.26765 34.26765 34.26765 34.26765 1
t2
#> t2 | |
## A tibble: 51,310 × 10 | |
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 | |
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> | |
#1 1 DATO VALIDO 0 1,401000023 1 01-SET-99 ppm CH4 Belgio 1503375 | |
#2 1 DATO VALIDO 0 1,422000051 2 01-SET-99 ppm CH4 Belgio 1503376 | |
#3 1 DATO VALIDO 0 1,444000006 3 01-SET-99 ppm CH4 Belgio 1503377 | |
# Parsing a single XML file here takes about as long as converting all of the XML files to CSV with `XML2CSVGenericConverter_V1.0.0`.
jimhester
commented
Apr 3, 2017
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment