jimhester · December 21, 2015 19:39 · eddelbuettel · Aug 27, 2013
diff --git a/.gitignore b/.gitignore
 .Rproj.user
 .Rhistory
 .RData
 *.Rproj
 *.html
diff --git a/read-file.cpp b/read-file.cpp
 #include <fstream>
 #include <string>
 #include <sstream>
 #include <Rcpp.h>
 using namespace Rcpp;

 // Read the whole file named by `path` into one string by streaming the
 // file's buffer into a stringstream.
 // `path` is expected to hold a single file name.
 // Raises an R error when the file cannot be opened, rather than silently
 // returning an empty string.
 // [[Rcpp::export]]
 CharacterVector read_file_cpp(CharacterVector path) {
  std::string fname = as<std::string>(path);
  std::ifstream t(fname.c_str());
  if (!t.is_open()) {
   stop("Cannot open file '" + fname + "'");
  }
  std::stringstream buffer;
  // Inserting the stream buffer copies everything up to end-of-file.
  buffer << t.rdbuf();
  return buffer.str();
 }

 // Read the whole file named by `path` by sizing a string to the file length
 // and filling it with a single read() call.
 // `path` is expected to hold a single file name.
 // Raises an R error when the file cannot be opened or its length cannot be
 // determined: tellg() reports failure as -1, which must never reach resize().
 // [[Rcpp::export]]
 CharacterVector read_file_cpp2(CharacterVector path) {
  std::string fname = as<std::string>(path);
  std::ifstream in(fname.c_str());
  if (!in.is_open()) {
   stop("Cannot open file '" + fname + "'");
  }
  // Seek to the end so that tellg() reports the length of the file.
  in.seekg(0, std::ios::end);
  std::streamoff size = in.tellg();
  if (size < 0) {
   stop("Cannot determine size of file '" + fname + "'");
  }
  std::string contents;
  contents.resize(static_cast<std::string::size_type>(size));
  in.seekg(0, std::ios::beg);
  if (size > 0) {
   in.read(&contents[0], contents.size());
   // Keep only what was actually read; it can be less than the reported size.
   contents.resize(static_cast<std::string::size_type>(in.gcount()));
  }
  in.close();
  return contents;
 }
diff --git a/read-file.md b/read-file.md
diff --git a/read-file.rmd b/read-file.rmd
 ```{r, echo = FALSE}
 library(microbenchmark)
 options(digits = 3)
 ```

 # Reading a complete file with R

 This is a short exploration of the most efficient way to read a complete file 
 (including newlines) into R - previously I'd used `readLines()` plus `paste()`
 but that's clearly the least efficient option.

 Here are the options:

 * Use `readLines()` and `paste()`

    ```{r}
    read_file1 <- function(path) {
      # Read the file line by line, then glue the lines back together so that
      # every line, including the last one, ends with "\n".
      lines <- readLines(path)
      joined <- paste0(lines, collapse = "\n")
      paste0(joined, "\n")
    }
    ```

 * Find out the size of the file and then use `readChar()`

    ```{r}
    read_file2 <- function(path) {
      # Look up the file size first, then read that many bytes in a single
      # call (`useBytes = TRUE` makes the count refer to bytes).
      n_bytes <- file.info(path)[["size"]]
      readChar(con = path, nchars = n_bytes, useBytes = TRUE)
    }
    ```

 * As above, but using `readBin()`, then converting to a character vector.
  Unfortunately you can't read into a character vector directly because
  `readBin()` with `what = "character"` is limited to 10000 characters

    ```{r}
    read_file3 <- function(path) {
      # Read the whole file as raw bytes (count taken from the file size),
      # then convert those bytes into a single string.
      n_bytes <- file.info(path)[["size"]]
      bytes <- readBin(con = path, what = "raw", n = n_bytes)
      rawToChar(bytes)
    }
    ```
    
 * A safer approach that doesn't rely on the size reported by `file.info()` (it is only used as a hint for preallocation) - this avoids race conditions where the file changes between asking for its size and reading it. (Suggested by [@klmr](http://twitter.com/klmr))

    ```{r}
    read_file4 <- function(path, chunk_size = 1e4) {
      # Read `path` in binary chunks of `chunk_size` bytes until a short chunk
      # signals the end of the file, then join the chunks into one string.
      # How much is read never depends on the size reported by the file system.
      con <- file(path, "rb", raw = TRUE)
      # `add = TRUE` appends this handler instead of replacing existing ones.
      on.exit(close(con), add = TRUE)

      # The reported size is only a preallocation hint; round up so the final,
      # partial chunk gets a slot too instead of growing the list.
      n <- ceiling(file.info(path)$size / chunk_size)
      chunks <- vector("list", n)

      i <- 1L
      chunks[[i]] <- readBin(con, "raw", n = chunk_size)
      # A full chunk means more data may follow; a short one means end of file.
      while (length(chunks[[i]]) == chunk_size) {
        i <- i + 1L
        chunks[[i]] <- readBin(con, "raw", n = chunk_size)
      }

      rawToChar(unlist(chunks, use.names = FALSE))
    }
    ```

 * An alternative would be to use C++.  This version was supplied by [@tim_yates](http://twitter.com/tim_yates/status/372369074019258370)
  
    ```{r}
    library(Rcpp)
    sourceCpp("read-file.cpp")
    ```
  
 We'll compare the results on a file included with R:
 ```{r}
 # Benchmark input: the COPYING file inside R's "doc" directory.
 path <- file.path(R.home("doc"), "COPYING")
 # Show the reported file size divided by 1024.
 file.info(path)$size / 1024
 ```

 First we need to check they all return the same results. (They won't if the file
 doesn't include a trailing newline)

 ```{r}
 # Treat read_file1() as the reference and require every other reader to
 # return an identical string for the same file.
 stopifnot(identical(read_file1(path), read_file2(path)))
 stopifnot(identical(read_file1(path), read_file3(path)))
 stopifnot(identical(read_file1(path), read_file4(path)))
 stopifnot(identical(read_file1(path), read_file_cpp(path)))
 stopifnot(identical(read_file1(path), read_file_cpp2(path)))
 ```

 The benchmarking results are clear: `readChar()` is the best base R option, and is 
 about four times faster than `readLines()` for this file.  The safer approach using chunked `readBin()` reads is about 50% slower. The C++ function is both fast (2x faster than `readChar()` and 7x faster than `readLines()`) and safe.

 ```{r}
 # Time every reader on the same file; each argument name labels the
 # expression it is attached to.
 library(microbenchmark)
 microbenchmark(
  readLines = read_file1(path),   
  readChar = read_file2(path),   
  readBin = read_file3(path),
  chunked_read = read_file4(path),
  Rcpp = read_file_cpp(path),
  Rcpp2 = read_file_cpp2(path)
 )
 ```
	#include <fstream>
	#include <string>
	#include <sstream>
	#include <Rcpp.h>
	using namespace Rcpp;

	// Read the whole file named by `path` into one string by streaming the
	// file's buffer into a stringstream.
	// `path` is expected to hold a single file name.
	// Raises an R error when the file cannot be opened, rather than silently
	// returning an empty string.
	// [[Rcpp::export]]
	CharacterVector read_file_cpp(CharacterVector path) {
	  std::string fname = as<std::string>(path);
	  std::ifstream t(fname.c_str());
	  if (!t.is_open()) {
	    stop("Cannot open file '" + fname + "'");
	  }
	  std::stringstream buffer;
	  // Inserting the stream buffer copies everything up to end-of-file.
	  buffer << t.rdbuf();
	  return buffer.str();
	}

	// Read the whole file named by `path` by sizing a string to the file length
	// and filling it with a single read() call.
	// `path` is expected to hold a single file name.
	// Raises an R error when the file cannot be opened or its length cannot be
	// determined: tellg() reports failure as -1, which must never reach resize().
	// [[Rcpp::export]]
	CharacterVector read_file_cpp2(CharacterVector path) {
	  std::string fname = as<std::string>(path);
	  std::ifstream in(fname.c_str());
	  if (!in.is_open()) {
	    stop("Cannot open file '" + fname + "'");
	  }
	  // Seek to the end so that tellg() reports the length of the file.
	  in.seekg(0, std::ios::end);
	  std::streamoff size = in.tellg();
	  if (size < 0) {
	    stop("Cannot determine size of file '" + fname + "'");
	  }
	  std::string contents;
	  contents.resize(static_cast<std::string::size_type>(size));
	  in.seekg(0, std::ios::beg);
	  if (size > 0) {
	    in.read(&contents[0], contents.size());
	    // Keep only what was actually read; it can be less than the reported size.
	    contents.resize(static_cast<std::string::size_type>(in.gcount()));
	  }
	  in.close();
	  return contents;
	}
	```{r, echo = FALSE}
	library(microbenchmark)
	options(digits = 3)
	```

	# Reading a complete file with R

	This is a short exploration of the most efficient way to read a complete file
	(including newlines) into R - previously I'd used `readLines()` plus `paste()`
	but that's clearly the least efficient option.

	Here are the options:

	* Use `readLines()` and `paste()`

	```{r}
	read_file1 <- function(path) {
	  # Read the file line by line, then glue the lines back together so that
	  # every line, including the last one, ends with "\n".
	  lines <- readLines(path)
	  joined <- paste0(lines, collapse = "\n")
	  paste0(joined, "\n")
	}
	```

	* Find out the size of the file and then use `readChar()`

	```{r}
	read_file2 <- function(path) {
	  # Look up the file size first, then read that many bytes in a single
	  # call (`useBytes = TRUE` makes the count refer to bytes).
	  n_bytes <- file.info(path)[["size"]]
	  readChar(con = path, nchars = n_bytes, useBytes = TRUE)
	}
	```

	* As above, but using `readBin()`, then converting to a character vector.
	Unfortunately you can't read into a character vector directly because
	`readBin()` with `what = "character"` is limited to 10000 characters

	```{r}
	read_file3 <- function(path) {
	  # Read the whole file as raw bytes (count taken from the file size),
	  # then convert those bytes into a single string.
	  n_bytes <- file.info(path)[["size"]]
	  bytes <- readBin(con = path, what = "raw", n = n_bytes)
	  rawToChar(bytes)
	}
	```

	* A safer approach that doesn't rely on the size reported by `file.info()` (it is only used as a hint for preallocation) - this avoids race conditions where the file changes between asking for its size and reading it. (Suggested by [@klmr](http://twitter.com/klmr))

	```{r}
	read_file4 <- function(path, chunk_size = 1e4) {
	  # Read `path` in binary chunks of `chunk_size` bytes until a short chunk
	  # signals the end of the file, then join the chunks into one string.
	  # How much is read never depends on the size reported by the file system.
	  con <- file(path, "rb", raw = TRUE)
	  # `add = TRUE` appends this handler instead of replacing existing ones.
	  on.exit(close(con), add = TRUE)

	  # The reported size is only a preallocation hint; round up so the final,
	  # partial chunk gets a slot too instead of growing the list.
	  n <- ceiling(file.info(path)$size / chunk_size)
	  chunks <- vector("list", n)

	  i <- 1L
	  chunks[[i]] <- readBin(con, "raw", n = chunk_size)
	  # A full chunk means more data may follow; a short one means end of file.
	  while (length(chunks[[i]]) == chunk_size) {
	    i <- i + 1L
	    chunks[[i]] <- readBin(con, "raw", n = chunk_size)
	  }

	  rawToChar(unlist(chunks, use.names = FALSE))
	}
	```

	* An alternative would be to use C++. This version was supplied by [@tim_yates](http://twitter.com/tim_yates/status/372369074019258370)

	```{r}
	library(Rcpp)
	sourceCpp("read-file.cpp")
	```

	We'll compare the results on a file included with R:
	```{r}
	# Benchmark input: the COPYING file inside R's "doc" directory.
	path <- file.path(R.home("doc"), "COPYING")
	# Show the reported file size divided by 1024.
	file.info(path)$size / 1024
	```

	First we need to check they all return the same results. (They won't if the file
	doesn't include a trailing newline)

	```{r}
	# Treat read_file1() as the reference and require every other reader to
	# return an identical string for the same file.
	stopifnot(identical(read_file1(path), read_file2(path)))
	stopifnot(identical(read_file1(path), read_file3(path)))
	stopifnot(identical(read_file1(path), read_file4(path)))
	stopifnot(identical(read_file1(path), read_file_cpp(path)))
	stopifnot(identical(read_file1(path), read_file_cpp2(path)))
	```

	The benchmarking results are clear: `readChar()` is the best base R option, and is
	about four times faster than `readLines()` for this file. The safer approach using chunked `readBin()` reads is about 50% slower. The C++ function is both fast (2x faster than `readChar()` and 7x faster than `readLines()`) and safe.

	```{r}
	# Time every reader on the same file; each argument name labels the
	# expression it is attached to.
	library(microbenchmark)
	microbenchmark(
	readLines = read_file1(path),
	readChar = read_file2(path),
	readBin = read_file3(path),
	chunked_read = read_file4(path),
	Rcpp = read_file_cpp(path),
	Rcpp2 = read_file_cpp2(path)
	)
	```