patperu · April 3, 2017 14:54 · jimhester · Apr 3, 2017
diff --git a/xml2tibble.R b/xml2tibble.R
 library(xml2)
 library(dplyr)
 library(purrr)
 library(tibble)

 # Fetch the sample XML dataset into a temporary file, parse it into `doc`,
 # then delete the temporary copy (`doc` is still used after the file is gone).
 temp <- tempfile(fileext = ".xml")
 # mode = "wb" writes the download byte-for-byte; the default text mode can
 # alter the file contents on Windows before the XML parser sees them.
 download.file(
   "https://www.dropbox.com/s/en2697k57o79gw8/VIS_DATASET_AMBIENTE_09_1999.xml?raw=1",
   temp,
   mode = "wb"
 )
 doc <- read_xml(temp)
 unlink(temp)

 # Approach 1 (from http://stackoverflow.com/a/40801799): build a one-row
 # tibble per DATA_RECORD element and stack them with bind_rows().
 #   - xml_find_all() selects every DATA_RECORD node below the root.
 #   - map(xml_children) gets the child (field) nodes of each record.
 #   - the lambda names each field's text by its element name, t() turns that
 #     named vector into a 1-row matrix, and as_tibble() makes it a 1-row tibble.
 # The result is assigned to t1 inside the timed expression so it can be
 # inspected afterwards; times = 1L runs the expression once. The outer
 # parentheses print the timing result as well as storing it in mbt1.
 (mbt1 <- microbenchmark::microbenchmark(
       t1 <- xml_find_all(doc, ".//DATA_RECORD") %>%
               map(xml_children) %>%
               map(~as_tibble(t(set_names(xml_text(.), xml_name(.))))) %>%
               bind_rows(), times = 1L))

 # Recorded timing for approach 1:
 #Unit: seconds
 # expr      min       lq     mean   median       uq      max neval
 #  ... 66.23821 66.23821 66.23821 66.23821 66.23821 66.23821     1

 # Print the result; recorded output (first rows) follows.
 t1
 #> t1
 ## A tibble: 51,310 × 10
 #   STATUS_STADIO    VERIFICA STATUS       MEDIA   ORA      DATA   UDM PARAMETRO STAZIONE      ID
 #           <chr>       <chr>  <chr>       <chr> <chr>     <chr> <chr>     <chr>    <chr>   <chr>
 #1              1 DATO VALIDO      0 1,401000023     1 01-SET-99   ppm       CH4   Belgio 1503375
 #2              1 DATO VALIDO      0 1,422000051     2 01-SET-99   ppm       CH4   Belgio 1503376
 #3              1 DATO VALIDO      0 1,444000006     3 01-SET-99   ppm       CH4   Belgio 1503377

 # Approach 2: read the text of every field in one vectorised call and reshape
 # the flat character vector into one row per record.
 #   - Column names are taken from the element names of the first record, so
 #     the tibble is not left with anonymous V1..Vn columns.
 #   - The column count is derived from the document instead of hard-coded,
 #     and every record is checked to have that many fields before reshaping.
 #   - Assumes all records list their fields in the same order (the count is
 #     checked, the order is not).
 # local() keeps the helper variables out of the workspace; only t2 is kept.
 # times = 1L runs the expression once; the outer parentheses print the timing.
 (mbt2 <- microbenchmark::microbenchmark({
   t2 <- local({
     records <- xml_children(doc)
     stopifnot(length(records) > 0L)
     col_names <- xml_name(xml_children(records[[1]]))
     # Reshaping by row is only valid when the records are rectangular.
     stopifnot(all(xml_length(records) == length(col_names)))
     values <- xml_text(xml_children(records))
     as_tibble(matrix(
       values,
       ncol = length(col_names),
       byrow = TRUE,
       dimnames = list(NULL, col_names)
     ))
   })
 },
 times = 1L))

 # Recorded timing (measured with the earlier variant that used a fixed
 # ncol = 10 and left the columns unnamed):
 #Unit: seconds
 # expr      min       lq     mean   median       uq      max neval
 # ...  34.26765 34.26765 34.26765 34.26765 34.26765 34.26765     1

 t2

 # Expected output (first rows), now with the element names as column names:
 #> t2
 ## A tibble: 51,310 × 10
 #   STATUS_STADIO    VERIFICA STATUS       MEDIA   ORA      DATA   UDM PARAMETRO STAZIONE      ID
 #           <chr>       <chr>  <chr>       <chr> <chr>     <chr> <chr>     <chr>    <chr>   <chr>
 #1              1 DATO VALIDO      0 1,401000023     1 01-SET-99   ppm       CH4   Belgio 1503375
 #2              1 DATO VALIDO      0 1,422000051     2 01-SET-99   ppm       CH4   Belgio 1503376
 #3              1 DATO VALIDO      0 1,444000006     3 01-SET-99   ppm       CH4   Belgio 1503377

 # Parsing one XML file takes the same time as it does to convert all XML files to CSV via `XML2CSVGenericConverter_V1.0.0`...
	library(xml2)
	library(dplyr)
	library(purrr)
	library(tibble)

	# Fetch the sample XML dataset into a temporary file, parse it into `doc`,
	# then delete the temporary copy (`doc` is still used after the file is gone).
	temp <- tempfile(fileext = ".xml")
	# mode = "wb" writes the download byte-for-byte; the default text mode can
	# alter the file contents on Windows before the XML parser sees them.
	download.file(
	  "https://www.dropbox.com/s/en2697k57o79gw8/VIS_DATASET_AMBIENTE_09_1999.xml?raw=1",
	  temp,
	  mode = "wb"
	)
	doc <- read_xml(temp)
	unlink(temp)

	# Approach 1 (from http://stackoverflow.com/a/40801799): build a one-row
	# tibble per DATA_RECORD element and stack them with bind_rows().
	#   - xml_find_all() selects every DATA_RECORD node below the root.
	#   - map(xml_children) gets the child (field) nodes of each record.
	#   - the lambda names each field's text by its element name, t() turns that
	#     named vector into a 1-row matrix, and as_tibble() makes it a 1-row tibble.
	# The result is assigned to t1 inside the timed expression so it can be
	# inspected afterwards; times = 1L runs the expression once. The outer
	# parentheses print the timing result as well as storing it in mbt1.
	(mbt1 <- microbenchmark::microbenchmark(
	t1 <- xml_find_all(doc, ".//DATA_RECORD") %>%
	map(xml_children) %>%
	map(~as_tibble(t(set_names(xml_text(.), xml_name(.))))) %>%
	bind_rows(), times = 1L))

	# Recorded timing for approach 1:
	#Unit: seconds
	# expr      min       lq     mean   median       uq      max neval
	#  ... 66.23821 66.23821 66.23821 66.23821 66.23821 66.23821     1

	# Print the result; recorded output (first rows) follows.
	t1
	#> t1
	## A tibble: 51,310 × 10
	#   STATUS_STADIO    VERIFICA STATUS       MEDIA   ORA      DATA   UDM PARAMETRO STAZIONE      ID
	#           <chr>       <chr>  <chr>       <chr> <chr>     <chr> <chr>     <chr>    <chr>   <chr>
	#1              1 DATO VALIDO      0 1,401000023     1 01-SET-99   ppm       CH4   Belgio 1503375
	#2              1 DATO VALIDO      0 1,422000051     2 01-SET-99   ppm       CH4   Belgio 1503376
	#3              1 DATO VALIDO      0 1,444000006     3 01-SET-99   ppm       CH4   Belgio 1503377

	# Approach 2: read the text of every field in one vectorised call and reshape
	# the flat character vector into one row per record.
	#   - Column names are taken from the element names of the first record, so
	#     the tibble is not left with anonymous V1..Vn columns.
	#   - The column count is derived from the document instead of hard-coded,
	#     and every record is checked to have that many fields before reshaping.
	#   - Assumes all records list their fields in the same order (the count is
	#     checked, the order is not).
	# local() keeps the helper variables out of the workspace; only t2 is kept.
	# times = 1L runs the expression once; the outer parentheses print the timing.
	(mbt2 <- microbenchmark::microbenchmark({
	  t2 <- local({
	    records <- xml_children(doc)
	    stopifnot(length(records) > 0L)
	    col_names <- xml_name(xml_children(records[[1]]))
	    # Reshaping by row is only valid when the records are rectangular.
	    stopifnot(all(xml_length(records) == length(col_names)))
	    values <- xml_text(xml_children(records))
	    as_tibble(matrix(
	      values,
	      ncol = length(col_names),
	      byrow = TRUE,
	      dimnames = list(NULL, col_names)
	    ))
	  })
	},
	times = 1L))

	# Recorded timing (measured with the earlier variant that used a fixed
	# ncol = 10 and left the columns unnamed):
	#Unit: seconds
	# expr      min       lq     mean   median       uq      max neval
	# ...  34.26765 34.26765 34.26765 34.26765 34.26765 34.26765     1

	t2

	# Expected output (first rows), now with the element names as column names:
	#> t2
	## A tibble: 51,310 × 10
	#   STATUS_STADIO    VERIFICA STATUS       MEDIA   ORA      DATA   UDM PARAMETRO STAZIONE      ID
	#           <chr>       <chr>  <chr>       <chr> <chr>     <chr> <chr>     <chr>    <chr>   <chr>
	#1              1 DATO VALIDO      0 1,401000023     1 01-SET-99   ppm       CH4   Belgio 1503375
	#2              1 DATO VALIDO      0 1,422000051     2 01-SET-99   ppm       CH4   Belgio 1503376
	#3              1 DATO VALIDO      0 1,444000006     3 01-SET-99   ppm       CH4   Belgio 1503377

	# Parsing one XML file takes the same time as it does to convert all XML files to CSV via `XML2CSVGenericConverter_V1.0.0`...