bhive01 · May 20, 2016 19:56
diff --git a/fileIOinR.R b/fileIOinR.R
 library(devtools)
 library(iotools)
 library(R.utils)
 library(feather) # install_github("wesm/feather/R")
 library(microbenchmark)
 library(data.table)
 library(readr)
 library(ggplot2)
 library(plotly)


 # Number of times microbenchmark() evaluates each benchmarked expression.
 testreps <- 25

 # NOTE(review): `3-29-16` is arithmetic, not a date literal; it evaluates to
 # -42, so this call is equivalent to set.seed(-42).
 set.seed(3-29-16)

 # Number of rows in the synthetic benchmark data frame.
 rows <- 1000000

 # Build the test data frame with one column per data type.
 # `ints` holds whole numbers between -100 and 100; round() returns doubles, so
 # the column is stored as numeric rather than integer.
 x <- data.frame(ints = round(runif(rows, -100, 100)), stringsAsFactors = FALSE)
 # Uniform random doubles between -100 and 100.
 x$floats <- runif(rows, -100, 100)
 # Random logical values.
 x$bools <- sample(c(TRUE, FALSE), rows, replace = TRUE)
 # Random timestamps: seconds since 1970-01-01 drawn uniformly between
 # 100000000 and 1459293171.
 x$dates <- as.POSIXct(runif(rows, 100000000, 1459293171), origin = "1970-01-01")
 # Factor whose values are sampled from the capital letters and the digits 0-9.
 x$categories <- as.factor(sample(c(LETTERS, 0:9), rows, replace = TRUE))
 # Random lowercase strings of 1 to 10 characters, generated one row at a time.
 x$strings <- replicate(rows, paste0(sample(letters, sample(1:10, 1), replace = TRUE), collapse = ""))

 # Benchmark labels; used to order the plot axes and to label the file-size table.
 namevector <- c("baseCSV", "baseCSVgz", "iotoolsCSV", "readrCSV", "readrCSVgz", "DTCSV", "DTCSVgz", "baseRDA", "baseRDS", "readrRDS", "readrRDSgz", "feather")

 # Write `df` as CSV with base R, then gzip the file that was written.
 #   df       - data frame to write
 #   filename - path of the uncompressed CSV handed to write.csv() and gzip()
 # Returns whatever R.utils::gzip() returns.
 funcbasegz <- function(df, filename) {
   utils::write.csv(df, file = filename)
   R.utils::gzip(filename, overwrite = TRUE)
 }

 # Write `df` as CSV with readr, then gzip the file that was written.
 #   df       - data frame to write
 #   filename - path of the uncompressed CSV handed to write_csv() and gzip()
 # Returns whatever R.utils::gzip() returns.
 funcreadrgz <- function(df, filename) {
   readr::write_csv(df, filename)
   R.utils::gzip(filename, overwrite = TRUE)
 }

 # Write `df` as CSV with data.table, then gzip the file that was written.
 #   df       - data frame to write
 #   filename - path of the uncompressed CSV handed to fwrite() and gzip()
 # Returns whatever R.utils::gzip() returns.
 funcDTgz <- function(df, filename) {
   data.table::fwrite(df, filename)
   R.utils::gzip(filename, overwrite = TRUE)
 }

 # Benchmark every writer on the same data frame `x`, evaluating each
 # expression `testreps` times. Entry names match the labels in `namevector`.
 writebenches <-
   microbenchmark(
     baseCSV = write.csv(x, file = "x.csv"),
     # Name the argument in full: `file =` only reached funcbasegz()'s
     # `filename` parameter through partial argument matching.
     baseCSVgz = funcbasegz(x, filename = "x.gz.csv"),
     iotoolsCSV = write.csv.raw(x, file = "x.iotools.csv"),
     readrCSV = write_csv(x, "x.readr.csv"),
     readrCSVgz = funcreadrgz(x, "x.gz.readr.csv"),
     DTCSV = fwrite(x, "x.DT.csv"),
     DTCSVgz = funcDTgz(x, "x.gz.DT.csv"),
     baseRDA = save(x, file = "x.rda"),
     baseRDS = saveRDS(x, file = "x.rds"),
     readrRDS = write_rds(x, "x.readr.rds", compress = "none"),
     readrRDSgz = write_rds(x, "x.gz.readr.rds", compress = "gz"),
     feather = write_feather(x, "x.feather"),
     times = testreps
   )

 # Plot the timings, ordering the discrete axis by reversed `namevector`.
 writeSpeed <-
   autoplot(writebenches) +
   labs(title = "Write Benchmarks") +
   scale_x_discrete(limits = rev(namevector))

 # Convert the ggplot object with plotly.
 ggplotly(writeSpeed)

 # Remove the data frame and the row count from memory.
 rm(x, rows)

 # Look up the on-disk size of every benchmark output in one vectorised
 # file.size() call; the paths are listed in the same order as `namevector`.
 filesize <- data.frame(
   name = namevector,
   size = file.size(c(
     "x.csv", "x.gz.csv.gz", "x.iotools.csv", "x.readr.csv",
     "x.gz.readr.csv.gz", "x.DT.csv", "x.gz.DT.csv.gz", "x.rda", "x.rds",
     "x.readr.rds", "x.gz.readr.rds", "x.feather"
   ))
 )

 # One point per output file, with the axis ordered by reversed `namevector`.
 # Rotating the x labels with theme(axis.text.x = element_text(angle=270)) was
 # tried, but plotly still cuts off the axes.
 fileSize <-
   ggplot(filesize, aes(x = name, y = size)) +
   geom_point() +
   scale_x_discrete(limits = rev(namevector)) +
   labs(x = "", y = "File Size (bytes)")
 ggplotly(fileSize)

 # Benchmark every reader against the files produced by the write benchmarks,
 # evaluating each expression `testreps` times.
 # NOTE(review): the CSV and feather entries store their result with
 # assign("y", ...), the RDS entries discard it, and load() recreates the saved
 # object instead of returning it - confirm this asymmetry is intended.
 readbenches <-
 microbenchmark(
  baseCSV = assign("y", read.csv("x.csv")), 
  baseCSVgz = assign("y", read.csv("x.gz.csv.gz")),
  iotoolsCSV = assign("y", read.csv.raw("x.iotools.csv")),
  readrCSV = assign("y", read_csv("x.readr.csv")),
  readrCSVgz = assign("y", read_csv("x.gz.readr.csv.gz")),
  DTCSV = assign("y", fread("x.DT.csv")),
  # gunzip() runs inside the timed expression, so decompressing the file is
  # part of this timing; remove = FALSE keeps the .gz for later repetitions.
  DTCSVgz = assign("y", fread(gunzip("x.gz.DT.csv.gz", remove = FALSE, overwrite = TRUE))),
  baseRDA = load("x.rda"),
  baseRDS = readRDS("x.rds"),
  readrRDS = read_rds("x.readr.rds"),
  readrRDSgz = read_rds("x.gz.readr.rds"),
  feather = assign("y", read_feather("x.feather")),
  times = testreps
 )

 # Plot the timings, ordering the discrete axis by reversed `namevector`.
 readSpeed <-
  autoplot(readbenches) + 
    labs(title = "Read Benchmarks") +
    scale_x_discrete(limits = rev(namevector))
 ggplotly(readSpeed)

 # Report the R session and loaded package details for this benchmark run.
 session_info()
diff --git a/fileIOinR.Rmd b/fileIOinR.Rmd
 ---
 title: "fileIOinR"
 author: "Brandon Hurr"
 date: "May 6, 2016"
 output: html_document
 ---

 ```{r setup, include=FALSE}
 library(devtools)
 library(iotools)
 library(R.utils)
 library(feather) # install_github("wesm/feather/R")
 library(microbenchmark)
 library(data.table)
 library(readr)
 library(ggplot2)
 library(plotly)

 # Number of times microbenchmark() evaluates each benchmarked expression.
 testreps <- 25

 # NOTE(review): `3-29-16` is arithmetic, not a date literal; it evaluates to
 # -42, so this call is equivalent to set.seed(-42).
 set.seed(3-29-16)

 # Number of rows in the synthetic benchmark data frame.
 rows <- 1000000

 # Build the test data frame with one column per data type.
 # `ints` holds whole numbers between -100 and 100; round() returns doubles, so
 # the column is stored as numeric rather than integer.
 x <- data.frame(ints = round(runif(rows, -100, 100)), stringsAsFactors = FALSE)
 # Uniform random doubles between -100 and 100.
 x$floats <- runif(rows, -100, 100)
 # Random logical values.
 x$bools <- sample(c(TRUE, FALSE), rows, replace = TRUE)
 # Random timestamps: seconds since 1970-01-01 drawn uniformly between
 # 100000000 and 1459293171.
 x$dates <- as.POSIXct(runif(rows, 100000000, 1459293171), origin = "1970-01-01")
 # Factor whose values are sampled from the capital letters and the digits 0-9.
 x$categories <- as.factor(sample(c(LETTERS, 0:9), rows, replace = TRUE))
 # Random lowercase strings of 1 to 10 characters, generated one row at a time.
 x$strings <- replicate(rows, paste0(sample(letters, sample(1:10, 1), replace = TRUE), collapse = ""))

 # Benchmark labels; used to order the plot axes and to label the file-size table.
 namevector <- c("baseCSV", "baseCSVgz", "iotoolsCSV", "readrCSV", "readrCSVgz", "DTCSV", "DTCSVgz", "baseRDA", "baseRDS", "readrRDS", "readrRDSgz", "feather")

 # Write `df` as CSV with base R, then gzip the file that was written.
 #   df       - data frame to write
 #   filename - path of the uncompressed CSV handed to write.csv() and gzip()
 # Returns whatever R.utils::gzip() returns.
 funcbasegz <- function(df, filename) {
   utils::write.csv(df, file = filename)
   R.utils::gzip(filename, overwrite = TRUE)
 }

 # Write `df` as CSV with readr, then gzip the file that was written.
 #   df       - data frame to write
 #   filename - path of the uncompressed CSV handed to write_csv() and gzip()
 # Returns whatever R.utils::gzip() returns.
 funcreadrgz <- function(df, filename) {
   readr::write_csv(df, filename)
   R.utils::gzip(filename, overwrite = TRUE)
 }

 # Write `df` as CSV with data.table, then gzip the file that was written.
 #   df       - data frame to write
 #   filename - path of the uncompressed CSV handed to fwrite() and gzip()
 # Returns whatever R.utils::gzip() returns.
 funcDTgz <- function(df, filename) {
   data.table::fwrite(df, filename)
   R.utils::gzip(filename, overwrite = TRUE)
 }
 ```

 ## File IO in R

 The purpose of this document is to examine the file IO speed and file size of commonly used file IO functions. It includes base R (CSV, RDA, RDS), iotools, readr, data.table, and feather. The script was written and run on an iMac (3.2 GHz 4 core 8 hyperthread i7 4770 processor) with a Crucial MX 200 SSD and 16 GB of RAM. The codebase was borrowed directly from [@rmflight](https://mobile.twitter.com/rmflight)'s [gist](https://gist.github.com/rmflight/53a93424f00b83a907d0d79ad5557d38) and edited for further comparison and testing. Many thanks to [@arun_sriniv](https://mobile.twitter.com/arun_sriniv) for helping me [get OpenMP running](https://github.com/Rdatatable/data.table/issues/1692).

 ### Writing Files and File Size
 Gzipping CSV files when saving will save space, but not as much space as RDS/RDA with compression does, and it comes at the cost of time. If you have the disk space, saving in the new feather format is the fastest. Writing to CSV with data.table is extremely fast now, but it does come at the expense of file size. `base::write.csv()` produced files ~20% larger than `data.table::fwrite()` and `readr::write_csv()`.

 ### Reading Files
 Feather was the fastest (on average) at reading the feather formatted data back into R, but it was also the most inconsistent. Second was the RDS format. As with writing, adding compression to cut file size down significantly increased the time required to import. data.table was faster than readr, but neither was as fast as RDA/RDS files.

 ### Overall
 If you can spare some disk space and the speed of feather is useful to you, it seems like the way to go for now. If disk space is at a premium and you don't mind waiting a little longer, then a compressed RDS/RDA file makes the most sense. If you want smaller size and portability, then compressing a CSV makes a lot of sense, but it is one of the slowest to read in. Base functions for reading/writing CSV files are clearly outpaced by these other methods.

 ## Write Speed
 ```{r write, echo=FALSE, message = FALSE, warning = FALSE, error=FALSE}
 # Benchmark every writer on the same data frame `x`, evaluating each
 # expression `testreps` times. Entry names match the labels in `namevector`.
 writebenches <-
   microbenchmark(
     baseCSV = write.csv(x, file = "x.csv"),
     # Name the argument in full: `file =` only reached funcbasegz()'s
     # `filename` parameter through partial argument matching.
     baseCSVgz = funcbasegz(x, filename = "x.gz.csv"),
     iotoolsCSV = write.csv.raw(x, file = "x.iotools.csv"),
     readrCSV = write_csv(x, "x.readr.csv"),
     readrCSVgz = funcreadrgz(x, "x.gz.readr.csv"),
     DTCSV = fwrite(x, "x.DT.csv"),
     DTCSVgz = funcDTgz(x, "x.gz.DT.csv"),
     baseRDA = save(x, file = "x.rda"),
     baseRDS = saveRDS(x, file = "x.rds"),
     readrRDS = write_rds(x, "x.readr.rds", compress = "none"),
     readrRDSgz = write_rds(x, "x.gz.readr.rds", compress = "gz"),
     feather = write_feather(x, "x.feather"),
     times = testreps
   )

 # Plot the timings, ordering the discrete axis by reversed `namevector`.
 writeSpeed <-
   autoplot(writebenches) +
   labs(title = "Write Benchmarks") +
   scale_x_discrete(limits = rev(namevector))

 # Convert the ggplot object with plotly.
 ggplotly(writeSpeed)
 ```

 ## File Size
 ```{r size, echo=FALSE, message = FALSE, warning = FALSE}
 # Look up the on-disk size of every benchmark output in one vectorised
 # file.size() call; the paths are listed in the same order as `namevector`.
 filesize <- data.frame(
   name = namevector,
   size = file.size(c(
     "x.csv", "x.gz.csv.gz", "x.iotools.csv", "x.readr.csv",
     "x.gz.readr.csv.gz", "x.DT.csv", "x.gz.DT.csv.gz", "x.rda", "x.rds",
     "x.readr.rds", "x.gz.readr.rds", "x.feather"
   ))
 )

 # One point per output file, with the axis ordered by reversed `namevector`.
 # Rotating the x labels with theme(axis.text.x = element_text(angle=270)) was
 # tried, but plotly still cuts off the axes.
 fileSize <-
   ggplot(filesize, aes(x = name, y = size)) +
   geom_point() +
   scale_x_discrete(limits = rev(namevector)) +
   labs(x = "", y = "File Size (bytes)")
 ggplotly(fileSize)
 ```


 ## Read Speed
 ```{r read, echo=FALSE, message = FALSE, warning = FALSE}
 # Benchmark every reader against the files produced by the write benchmarks,
 # evaluating each expression `testreps` times.
 # NOTE(review): the CSV and feather entries store their result with
 # assign("y", ...), the RDS entries discard it, and load() recreates the saved
 # object instead of returning it - confirm this asymmetry is intended.
 readbenches <-
 microbenchmark(
  baseCSV = assign("y", read.csv("x.csv")), 
  baseCSVgz = assign("y", read.csv("x.gz.csv.gz")),
  iotoolsCSV = assign("y", read.csv.raw("x.iotools.csv")),
  readrCSV = assign("y", read_csv("x.readr.csv")),
  readrCSVgz = assign("y", read_csv("x.gz.readr.csv.gz")),
  DTCSV = assign("y", fread("x.DT.csv")),
  # gunzip() runs inside the timed expression, so decompressing the file is
  # part of this timing; remove = FALSE keeps the .gz for later repetitions.
  DTCSVgz = assign("y", fread(gunzip("x.gz.DT.csv.gz", remove = FALSE, overwrite = TRUE))),
  baseRDA = load("x.rda"),
  baseRDS = readRDS("x.rds"),
  readrRDS = read_rds("x.readr.rds"),
  readrRDSgz = read_rds("x.gz.readr.rds"),
  feather = assign("y", read_feather("x.feather")),
  times = testreps
 )

 # Plot the timings, ordering the discrete axis by reversed `namevector`.
 readSpeed <-
  autoplot(readbenches) + 
    labs(title = "Read Benchmarks") +
    scale_x_discrete(limits = rev(namevector))
 ggplotly(readSpeed)
 ```

 ``` {r session}
 # Report the R session and loaded package details for this benchmark run.
 session_info()
 ```
	library(devtools)
	library(iotools)
	library(R.utils)
	library(feather) # install_github("wesm/feather/R")
	library(microbenchmark)
	library(data.table)
	library(readr)
	library(ggplot2)
	library(plotly)


	# Number of times microbenchmark() evaluates each benchmarked expression.
	testreps <- 25

	# NOTE(review): `3-29-16` is arithmetic, not a date literal; it evaluates to
	# -42, so this call is equivalent to set.seed(-42).
	set.seed(3-29-16)

	# Number of rows in the synthetic benchmark data frame.
	rows <- 1000000

	# Build the test data frame with one column per data type.
	# `ints` holds whole numbers between -100 and 100; round() returns doubles, so
	# the column is stored as numeric rather than integer.
	x <- data.frame(ints = round(runif(rows, -100, 100)), stringsAsFactors = FALSE)
	# Uniform random doubles between -100 and 100.
	x$floats <- runif(rows, -100, 100)
	# Random logical values.
	x$bools <- sample(c(TRUE, FALSE), rows, replace = TRUE)
	# Random timestamps: seconds since 1970-01-01 drawn uniformly between
	# 100000000 and 1459293171.
	x$dates <- as.POSIXct(runif(rows, 100000000, 1459293171), origin = "1970-01-01")
	# Factor whose values are sampled from the capital letters and the digits 0-9.
	x$categories <- as.factor(sample(c(LETTERS, 0:9), rows, replace = TRUE))
	# Random lowercase strings of 1 to 10 characters, generated one row at a time.
	x$strings <- replicate(rows, paste0(sample(letters, sample(1:10, 1), replace = TRUE), collapse = ""))

	# Benchmark labels; used to order the plot axes and to label the file-size table.
	namevector <- c("baseCSV", "baseCSVgz", "iotoolsCSV", "readrCSV", "readrCSVgz", "DTCSV", "DTCSVgz", "baseRDA", "baseRDS", "readrRDS", "readrRDSgz", "feather")

	# Write `df` as CSV with base R, then gzip the file that was written.
	#   df       - data frame to write
	#   filename - path of the uncompressed CSV handed to write.csv() and gzip()
	# Returns whatever R.utils::gzip() returns.
	funcbasegz <- function(df, filename) {
	  utils::write.csv(df, file = filename)
	  R.utils::gzip(filename, overwrite = TRUE)
	}

	# Write `df` as CSV with readr, then gzip the file that was written.
	#   df       - data frame to write
	#   filename - path of the uncompressed CSV handed to write_csv() and gzip()
	# Returns whatever R.utils::gzip() returns.
	funcreadrgz <- function(df, filename) {
	  readr::write_csv(df, filename)
	  R.utils::gzip(filename, overwrite = TRUE)
	}

	# Write `df` as CSV with data.table, then gzip the file that was written.
	#   df       - data frame to write
	#   filename - path of the uncompressed CSV handed to fwrite() and gzip()
	# Returns whatever R.utils::gzip() returns.
	funcDTgz <- function(df, filename) {
	  data.table::fwrite(df, filename)
	  R.utils::gzip(filename, overwrite = TRUE)
	}

	# Benchmark every writer on the same data frame `x`, evaluating each
	# expression `testreps` times. Entry names match the labels in `namevector`.
	writebenches <-
	  microbenchmark(
	    baseCSV = write.csv(x, file = "x.csv"),
	    # Name the argument in full: `file =` only reached funcbasegz()'s
	    # `filename` parameter through partial argument matching.
	    baseCSVgz = funcbasegz(x, filename = "x.gz.csv"),
	    iotoolsCSV = write.csv.raw(x, file = "x.iotools.csv"),
	    readrCSV = write_csv(x, "x.readr.csv"),
	    readrCSVgz = funcreadrgz(x, "x.gz.readr.csv"),
	    DTCSV = fwrite(x, "x.DT.csv"),
	    DTCSVgz = funcDTgz(x, "x.gz.DT.csv"),
	    baseRDA = save(x, file = "x.rda"),
	    baseRDS = saveRDS(x, file = "x.rds"),
	    readrRDS = write_rds(x, "x.readr.rds", compress = "none"),
	    readrRDSgz = write_rds(x, "x.gz.readr.rds", compress = "gz"),
	    feather = write_feather(x, "x.feather"),
	    times = testreps
	  )

	# Plot the timings, ordering the discrete axis by reversed `namevector`.
	writeSpeed <-
	  autoplot(writebenches) +
	  labs(title = "Write Benchmarks") +
	  scale_x_discrete(limits = rev(namevector))

	# Convert the ggplot object with plotly.
	ggplotly(writeSpeed)

	# Remove the data frame and the row count from memory.
	rm(x, rows)

	# Look up the on-disk size of every benchmark output in one vectorised
	# file.size() call; the paths are listed in the same order as `namevector`.
	filesize <- data.frame(
	  name = namevector,
	  size = file.size(c(
	    "x.csv", "x.gz.csv.gz", "x.iotools.csv", "x.readr.csv",
	    "x.gz.readr.csv.gz", "x.DT.csv", "x.gz.DT.csv.gz", "x.rda", "x.rds",
	    "x.readr.rds", "x.gz.readr.rds", "x.feather"
	  ))
	)

	# One point per output file, with the axis ordered by reversed `namevector`.
	# Rotating the x labels with theme(axis.text.x = element_text(angle=270)) was
	# tried, but plotly still cuts off the axes.
	fileSize <-
	  ggplot(filesize, aes(x = name, y = size)) +
	  geom_point() +
	  scale_x_discrete(limits = rev(namevector)) +
	  labs(x = "", y = "File Size (bytes)")
	ggplotly(fileSize)

	# Benchmark every reader against the files produced by the write benchmarks,
	# evaluating each expression `testreps` times.
	# NOTE(review): the CSV and feather entries store their result with
	# assign("y", ...), the RDS entries discard it, and load() recreates the saved
	# object instead of returning it - confirm this asymmetry is intended.
	readbenches <-
	microbenchmark(
	baseCSV = assign("y", read.csv("x.csv")),
	baseCSVgz = assign("y", read.csv("x.gz.csv.gz")),
	iotoolsCSV = assign("y", read.csv.raw("x.iotools.csv")),
	readrCSV = assign("y", read_csv("x.readr.csv")),
	readrCSVgz = assign("y", read_csv("x.gz.readr.csv.gz")),
	DTCSV = assign("y", fread("x.DT.csv")),
	# gunzip() runs inside the timed expression, so decompressing the file is
	# part of this timing; remove = FALSE keeps the .gz for later repetitions.
	DTCSVgz = assign("y", fread(gunzip("x.gz.DT.csv.gz", remove = FALSE, overwrite = TRUE))),
	baseRDA = load("x.rda"),
	baseRDS = readRDS("x.rds"),
	readrRDS = read_rds("x.readr.rds"),
	readrRDSgz = read_rds("x.gz.readr.rds"),
	feather = assign("y", read_feather("x.feather")),
	times = testreps
	)

	# Plot the timings, ordering the discrete axis by reversed `namevector`.
	readSpeed <-
	autoplot(readbenches) +
	labs(title = "Read Benchmarks") +
	scale_x_discrete(limits = rev(namevector))
	ggplotly(readSpeed)

	# Report the R session and loaded package details for this benchmark run.
	session_info()
	---
	title: "fileIOinR"
	author: "Brandon Hurr"
	date: "May 6, 2016"
	output: html_document
	---

	```{r setup, include=FALSE}
	library(devtools)
	library(iotools)
	library(R.utils)
	library(feather) # install_github("wesm/feather/R")
	library(microbenchmark)
	library(data.table)
	library(readr)
	library(ggplot2)
	library(plotly)

	# Number of times microbenchmark() evaluates each benchmarked expression.
	testreps <- 25

	# NOTE(review): `3-29-16` is arithmetic, not a date literal; it evaluates to
	# -42, so this call is equivalent to set.seed(-42).
	set.seed(3-29-16)

	# Number of rows in the synthetic benchmark data frame.
	rows <- 1000000

	# Build the test data frame with one column per data type.
	# `ints` holds whole numbers between -100 and 100; round() returns doubles, so
	# the column is stored as numeric rather than integer.
	x <- data.frame(ints = round(runif(rows, -100, 100)), stringsAsFactors = FALSE)
	# Uniform random doubles between -100 and 100.
	x$floats <- runif(rows, -100, 100)
	# Random logical values.
	x$bools <- sample(c(TRUE, FALSE), rows, replace = TRUE)
	# Random timestamps: seconds since 1970-01-01 drawn uniformly between
	# 100000000 and 1459293171.
	x$dates <- as.POSIXct(runif(rows, 100000000, 1459293171), origin = "1970-01-01")
	# Factor whose values are sampled from the capital letters and the digits 0-9.
	x$categories <- as.factor(sample(c(LETTERS, 0:9), rows, replace = TRUE))
	# Random lowercase strings of 1 to 10 characters, generated one row at a time.
	x$strings <- replicate(rows, paste0(sample(letters, sample(1:10, 1), replace = TRUE), collapse = ""))

	# Benchmark labels; used to order the plot axes and to label the file-size table.
	namevector <- c("baseCSV", "baseCSVgz", "iotoolsCSV", "readrCSV", "readrCSVgz", "DTCSV", "DTCSVgz", "baseRDA", "baseRDS", "readrRDS", "readrRDSgz", "feather")

	# Write `df` as CSV with base R, then gzip the file that was written.
	#   df       - data frame to write
	#   filename - path of the uncompressed CSV handed to write.csv() and gzip()
	# Returns whatever R.utils::gzip() returns.
	funcbasegz <- function(df, filename) {
	  utils::write.csv(df, file = filename)
	  R.utils::gzip(filename, overwrite = TRUE)
	}

	# Write `df` as CSV with readr, then gzip the file that was written.
	#   df       - data frame to write
	#   filename - path of the uncompressed CSV handed to write_csv() and gzip()
	# Returns whatever R.utils::gzip() returns.
	funcreadrgz <- function(df, filename) {
	  readr::write_csv(df, filename)
	  R.utils::gzip(filename, overwrite = TRUE)
	}

	# Write `df` as CSV with data.table, then gzip the file that was written.
	#   df       - data frame to write
	#   filename - path of the uncompressed CSV handed to fwrite() and gzip()
	# Returns whatever R.utils::gzip() returns.
	funcDTgz <- function(df, filename) {
	  data.table::fwrite(df, filename)
	  R.utils::gzip(filename, overwrite = TRUE)
	}
	```

	## File IO in R

	The purpose of this document is to examine the file IO speed and file size of commonly used file IO functions. It includes base R (CSV, RDA, RDS), iotools, readr, data.table, and feather. The script was written and run on an iMac (3.2 GHz 4 core 8 hyperthread i7 4770 processor) with a Crucial MX 200 SSD and 16 GB of RAM. The codebase was borrowed directly from [@rmflight](https://mobile.twitter.com/rmflight)'s [gist](https://gist.github.com/rmflight/53a93424f00b83a907d0d79ad5557d38) and edited for further comparison and testing. Many thanks to [@arun_sriniv](https://mobile.twitter.com/arun_sriniv) for helping me [get OpenMP running](https://github.com/Rdatatable/data.table/issues/1692).

	### Writing Files and File Size
	Gzipping CSV files when saving will save space, but not as much space as RDS/RDA with compression does, and it comes at the cost of time. If you have the disk space, saving in the new feather format is the fastest. Writing to CSV with data.table is extremely fast now, but it does come at the expense of file size. `base::write.csv()` produced files ~20% larger than `data.table::fwrite()` and `readr::write_csv()`.

	### Reading Files
	Feather was the fastest (on average) at reading the feather formatted data back into R, but it was also the most inconsistent. Second was the RDS format. As with writing, adding compression to cut file size down significantly increased the time required to import. data.table was faster than readr, but neither was as fast as RDA/RDS files.

	### Overall
	If you can spare some disk space and the speed of feather is useful to you, it seems like the way to go for now. If disk space is at a premium and you don't mind waiting a little longer, then a compressed RDS/RDA file makes the most sense. If you want smaller size and portability, then compressing a CSV makes a lot of sense, but it is one of the slowest to read in. Base functions for reading/writing CSV files are clearly outpaced by these other methods.

	## Write Speed
	```{r write, echo=FALSE, message = FALSE, warning = FALSE, error=FALSE}
	# Benchmark every writer on the same data frame `x`, evaluating each
	# expression `testreps` times. Entry names match the labels in `namevector`.
	writebenches <-
	  microbenchmark(
	    baseCSV = write.csv(x, file = "x.csv"),
	    # Name the argument in full: `file =` only reached funcbasegz()'s
	    # `filename` parameter through partial argument matching.
	    baseCSVgz = funcbasegz(x, filename = "x.gz.csv"),
	    iotoolsCSV = write.csv.raw(x, file = "x.iotools.csv"),
	    readrCSV = write_csv(x, "x.readr.csv"),
	    readrCSVgz = funcreadrgz(x, "x.gz.readr.csv"),
	    DTCSV = fwrite(x, "x.DT.csv"),
	    DTCSVgz = funcDTgz(x, "x.gz.DT.csv"),
	    baseRDA = save(x, file = "x.rda"),
	    baseRDS = saveRDS(x, file = "x.rds"),
	    readrRDS = write_rds(x, "x.readr.rds", compress = "none"),
	    readrRDSgz = write_rds(x, "x.gz.readr.rds", compress = "gz"),
	    feather = write_feather(x, "x.feather"),
	    times = testreps
	  )

	# Plot the timings, ordering the discrete axis by reversed `namevector`.
	writeSpeed <-
	  autoplot(writebenches) +
	  labs(title = "Write Benchmarks") +
	  scale_x_discrete(limits = rev(namevector))

	# Convert the ggplot object with plotly.
	ggplotly(writeSpeed)
	```

	## File Size
	```{r size, echo=FALSE, message = FALSE, warning = FALSE}
	# Look up the on-disk size of every benchmark output in one vectorised
	# file.size() call; the paths are listed in the same order as `namevector`.
	filesize <- data.frame(
	  name = namevector,
	  size = file.size(c(
	    "x.csv", "x.gz.csv.gz", "x.iotools.csv", "x.readr.csv",
	    "x.gz.readr.csv.gz", "x.DT.csv", "x.gz.DT.csv.gz", "x.rda", "x.rds",
	    "x.readr.rds", "x.gz.readr.rds", "x.feather"
	  ))
	)

	# One point per output file, with the axis ordered by reversed `namevector`.
	# Rotating the x labels with theme(axis.text.x = element_text(angle=270)) was
	# tried, but plotly still cuts off the axes.
	fileSize <-
	  ggplot(filesize, aes(x = name, y = size)) +
	  geom_point() +
	  scale_x_discrete(limits = rev(namevector)) +
	  labs(x = "", y = "File Size (bytes)")
	ggplotly(fileSize)
	```


	## Read Speed
	```{r read, echo=FALSE, message = FALSE, warning = FALSE}
	# Benchmark every reader against the files produced by the write benchmarks,
	# evaluating each expression `testreps` times.
	# NOTE(review): the CSV and feather entries store their result with
	# assign("y", ...), the RDS entries discard it, and load() recreates the saved
	# object instead of returning it - confirm this asymmetry is intended.
	readbenches <-
	microbenchmark(
	baseCSV = assign("y", read.csv("x.csv")),
	baseCSVgz = assign("y", read.csv("x.gz.csv.gz")),
	iotoolsCSV = assign("y", read.csv.raw("x.iotools.csv")),
	readrCSV = assign("y", read_csv("x.readr.csv")),
	readrCSVgz = assign("y", read_csv("x.gz.readr.csv.gz")),
	DTCSV = assign("y", fread("x.DT.csv")),
	# gunzip() runs inside the timed expression, so decompressing the file is
	# part of this timing; remove = FALSE keeps the .gz for later repetitions.
	DTCSVgz = assign("y", fread(gunzip("x.gz.DT.csv.gz", remove = FALSE, overwrite = TRUE))),
	baseRDA = load("x.rda"),
	baseRDS = readRDS("x.rds"),
	readrRDS = read_rds("x.readr.rds"),
	readrRDSgz = read_rds("x.gz.readr.rds"),
	feather = assign("y", read_feather("x.feather")),
	times = testreps
	)

	# Plot the timings, ordering the discrete axis by reversed `namevector`.
	readSpeed <-
	autoplot(readbenches) +
	labs(title = "Read Benchmarks") +
	scale_x_discrete(limits = rev(namevector))
	ggplotly(readSpeed)
	```

	``` {r session}
	# Report the R session and loaded package details for this benchmark run.
	session_info()
	```