Surprisingly large speedup from pasting lines together
Gist infotroph/cec9a9fb0158530d817f — last active August 29, 2015
# Context: I have untidy CSVs that need some junk lines filtered out
# before they're even grid-shaped.
# I currently do the filtering with an external sed call,
# but wanted something that would work on any OS.
# In https://gist.github.com/infotroph/dd0faa5fd24bb78b4ff6
# I asked how to do the filtering from within R,
# and settled on readLines -> filter -> send filtered lines back to read.csv.
# This script doesn't filter anything;
# it just tests different ways of passing lines back into read.csv afterwards:
# as a vector of strings, as one single string with embedded newlines,
# or as a textConnection.
library(microbenchmark)
library(readr)

TMPFILE = "readtime_tmp.csv"
NROWS = 1e3 # Write/read this many rows to/from temp file.
NTIMES = 5  # Run the benchmark this many times.
# N.B. Keep these both very small at first!
# On my machine even 5000 lines make f_strvec() and f_con() take >30 sec per read.
# Write some fake data to temp file.
# Here I'm using 1 column of character data, then 104 columns of doubles.
cat( # headers
	paste(c("id", rep(letters, 4)), collapse=","),
	sep="\n",
	file=TMPFILE)
invisible(replicate( # data
	NROWS,
	cat(
		paste(c(sample(letters, 1), rnorm(26*4)), collapse=","),
		sep="\n",
		file=TMPFILE,
		append=TRUE)))

colClasses = c("character", rep("numeric", 26*4))
readr_classes = paste(c("c", rep("d", 26*4)), collapse="")
# Baseline for comparison: standard call to read.csv
f_direct = function(){
	return(read.csv(TMPFILE, colClasses=colClasses))
}

# File contents as a vector of character strings
f_strvec = function(){
	x = readLines(TMPFILE)
	return(read.csv(text=x, colClasses=colClasses))
}

# File contents glued into one single string before passing
f_onestr = function(){
	x = readLines(TMPFILE)
	return(read.csv(
		text=do.call("paste", c(as.list(x), sep="\n")),
		colClasses=colClasses))
}

# Vector of strings passed as a textConnection
f_con = function(){
	x = readLines(TMPFILE)
	xc = textConnection(x)
	on.exit(close(xc))
	return(read.csv(xc, colClasses=colClasses))
}
# Hadleyfied read_lines, base read.csv
f_readr_lines = function(){
	x = read_lines(TMPFILE)
	return(read.csv(
		text=do.call("paste", c(as.list(x), sep="\n")),
		colClasses=colClasses))
}

# Both the line reading and the CSV parse Hadleyfied.
# N.B. On first several tries, this threw
# "C stack usage is too close to the limit",
# but I haven't been able to reproduce that since.
# I wouldn't count on this function working for very large files.
f_readr_both = function(){
	x = read_lines(TMPFILE)
	return(read_csv(
		file=do.call("paste", c(as.list(x), sep="\n")),
		col_types=readr_classes))
}
cat(paste("Reading", NROWS, "lines.\n"))
a = f_direct()
b = f_strvec()
c = f_con()
d = f_onestr()
e = f_readr_lines()
f = f_readr_both()

# read_csv()'s default column names aren't the same as read.csv()'s.
# Let's make them match for comparison.
names(e) = make.names(names(e), unique=TRUE)
names(f) = make.names(names(f), unique=TRUE)
class(e) = "data.frame"
class(f) = "data.frame"

# Did all methods return the same dataframe?
stopifnot(all(
	all.equal(a,b),
	all.equal(a,c),
	all.equal(a,d),
	all.equal(a,e),
	all.equal(a,f)))
rm(list=c("a", "b", "c", "d", "e", "f"))
print(microbenchmark(
	a=f_direct(),
	b=f_strvec(),
	c=f_con(),
	d=f_onestr(),
	e=f_readr_lines(),
	f=f_readr_both(),
	times=NTIMES))
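For readers skimming past the function definitions: the three hand-off styles under test reduce to a few lines each. This is my own toy restatement on a tiny in-memory vector, not the benchmark's temp file:

```r
# Toy restatement of the three hand-off styles benchmarked above.
lines <- c("id,x", "a,1", "b,2")

d_vec <- read.csv(text = lines)                         # vector of strings
d_one <- read.csv(text = paste(lines, collapse = "\n")) # one glued string
con <- textConnection(lines)                            # explicit connection
d_con <- read.csv(con)
close(con)

# All three should parse to the same data frame.
stopifnot(identical(d_vec, d_one), identical(d_vec, d_con))
```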
Wow, interesting, and good to know.
Here's the output with 1e4 rows evaluated 10 times on my 2012 MBP:
Note the spectacularly long read times for expressions b and c, both of which pass a vector of lines rather than one long string with embedded newlines. Surprisingly (to me), the slow operations are CPU-intensive and not memory-bound at all.
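One way to narrow that down (my own probe, not timings from the benchmark above) is to time the collapse step and the two parse styles separately. If paste() is cheap while read.csv(text = <vector>) dominates, the cost lives in the line-by-line connection handling rather than in string manipulation:

```r
# Rough probe: is the time in paste() or in the parse?
# (Illustrative size; scale n up to see the gap clearly.)
n <- 2000
lines <- c("x,y", replicate(n, paste(rnorm(2), collapse = ",")))

t_paste  <- system.time(one <- paste(lines, collapse = "\n"))["elapsed"]
t_onestr <- system.time(a <- read.csv(text = one))["elapsed"]
t_vector <- system.time(b <- read.csv(text = lines))["elapsed"]

# Both parses must agree; only their timings differ.
stopifnot(isTRUE(all.equal(a, b)))
```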
For my purposes, pasting all the text into one string is the clear winner, and it can be made even faster than read.csv if I deploy the readr functions. But I'd still appreciate insight: why is the vector approach so slow even when using textConnection? Will this be true in general, or is there something pathological about this example?
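One relevant detail from ?read.table, worth checking though not a full answer: when text= is supplied instead of a file, read.table wraps it in a textConnection() internally. So the text = <vector> path (b) and the explicit-connection path (c) should be the same machinery underneath, which would match their near-identical timings. A quick consistency check:

```r
# Per ?read.table, a character vector passed via text= is read through an
# internal textConnection(), so these two calls should be equivalent.
x <- c("id,x", "a,1", "b,2")
via_text <- read.csv(text = x)
con <- textConnection(x)
via_con <- read.csv(con)
close(con)
stopifnot(identical(via_text, via_con))
```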