# My input files have short header lines, then CSV data, then short footer lines.
# I'm currently trimming the short lines with an external call to sed,
# but I want a pure-R solution for portability.
# This version works nicely on small examples but gets very slow on large files,
# because append() grows the list, triggering a memory reallocation, for every line.
# Suggestions for speed improvement requested.
read.longline = function(file){
    f = file(file, "r")
    lines = list()
    repeat{ # read short headers & discard
        l = readLines(f, n=1)
        if(length(l) > 0 && nchar(l) > 65){
            # We've found the first data row.
            # Leave it on the stack to process in the next loop.
            pushBack(l, f)
            break
        }
    }
    repeat{ # read long lines, add to CSV, break when short lines start again
        l = readLines(f, n=1)
        if(length(l) > 0 && nchar(l) > 65){
            # Naive implementation!
            # Likely to be VERY slow because we're growing lines every time.
            lines = append(lines, l)
        }else{
            # Either we've hit a short line == beginning of PGP block,
            # or empty line == end of the file.
            # Either way we're done.
            break
        }
    }
    close(f)
    # Now stitch lines together into a dataframe
    txtdat = do.call("paste", c(lines, sep="\n"))
    return(read.csv(text=txtdat, stringsAsFactors=FALSE))
}
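Not part of the original gist, but a quick way to see why the append() pattern above hurts: growing a list one element at a time copies the whole list on every iteration, so the total cost is quadratic in the number of lines read.
grow_by_append = function(n){
    out = list()
    for(i in seq_len(n)) out = append(out, list(i))  # copies the whole list each time
    out
}
grow_preallocated = function(n){
    out = vector("list", n)  # allocate once up front
    for(i in seq_len(n)) out[[i]] = i
    out
}
system.time(grow_by_append(20000))     # slows down roughly quadratically with n
system.time(grow_preallocated(20000))  # stays near-instant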
What about reading the file into a vector of strings, then identifying the first data line? For example (this is off the top of my head, I haven't run it):
fileContents <- readLines(f)
firstDataLine <- min(which(nchar(fileContents) > 65))
return(read.csv(connection(fileContents), skip=firstDataLine-1))
Is connection() part of a package I don't have, or pseudocode?
This gets me to within a few percent of plain read.csv's speed, instead of several times slower:
read.longlines = function(file){
    f = file(file, "r")
    f_lines = readLines(f)
    close(f)
    # Data rows are the long lines; everything shorter is header/footer junk.
    wanted = which(nchar(f_lines) > 65)
    firstData = min(wanted)
    lastData = max(wanted)
    # Paste the kept lines back into one newline-separated string for read.csv.
    return(read.csv(
        text=do.call(
            what="paste",
            args=c(as.list(f_lines[firstData:lastData]), sep="\n")),
        stringsAsFactors=FALSE))
}
It runs in under a minute on a 150,000-line test file, which isn't blazing but is probably good enough for my current purposes. Thanks!
You're welcome. Sorry, I meant textConnection. Here's some revised code -- this time, I actually tested it!
fileContents <- readLines("test.txt")
firstDataLine <- min(which(nchar(fileContents) >= MIN_DATA_LINE_LENGTH))
lastDataLine <- max(which(nchar(fileContents) >= MIN_DATA_LINE_LENGTH))
con <- textConnection(fileContents)
read.table(con, skip=firstDataLine-1, nrows=lastDataLine-firstDataLine,sep=",",header=TRUE)
It took me a while to convince myself, but it looks as if pasting the lines together first is significantly faster than (at least this variant of) using a textConnection: https://gist.github.com/infotroph/cec9a9fb0158530d817f is a self-contained demo comparing the speed of different reading approaches with no filtering.
On my machine, a few thousand lines take milliseconds with a pasted string and minutes with a vector of lines, textConnection or not. What especially surprises me is that both approaches have a similar memory footprint, and the vector read seems to be entirely CPU-bound. No idea what it spends all those cycles on...
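For concreteness, a minimal self-contained version of that comparison (this isn't the linked gist, just a sketch with made-up data; absolute timings will depend on machine and file size):
lines_vec  <- replicate(5000, paste(sample(0:9, 20, replace = TRUE), collapse = ","))
one_string <- paste(lines_vec, collapse = "\n")
system.time(a <- read.csv(text = one_string, header = FALSE))  # one pasted string with embedded newlines
system.time(b <- read.csv(text = lines_vec,  header = FALSE))  # vector of lines (read.csv builds a textConnection internally)
system.time({                                                  # or build the textConnection yourself
    con <- textConnection(lines_vec)
    d <- read.csv(con, header = FALSE)
    close(con)
})
identical(a, b)  # same data frame either way; only the timing differs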
I'm still not sure if this is general or if my test case is pathological, but bottom line: if you came here looking to pass a vector of lines through a textConnection into read.csv, try pasting them all together as one string with internal newlines and passing that into readr::read_csv instead; maybe you'll get a speedup.
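Concretely, assuming `lines` holds the character vector of data lines, the pasted-string variants look like this (note: how readr accepts literal strings varies by version; recent readr prefers the string wrapped in I()):
one_csv <- paste(lines, collapse = "\n")                     # one big string with embedded newlines
dat  <- readr::read_csv(I(one_csv))                          # readr route; I() marks literal data
dat2 <- read.csv(text = one_csv, stringsAsFactors = FALSE)   # base-R route used in the demo above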
Best suggestion so far: scan the file twice, once to find the start and end lines and once with read.csv() at those offsets.
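A minimal sketch of that two-pass approach (not the exact code from the gist; the 65-character threshold and the header-row assumption are carried over from the snippets above):
read.twopass = function(file, min_chars = 65){
    # Pass 1: find the long lines, i.e. the CSV header row plus the data rows
    wanted = which(nchar(readLines(file)) > min_chars)
    firstData = min(wanted)
    lastData = max(wanted)
    # Pass 2: let read.csv re-read the file at those offsets,
    # skipping the short header lines and stopping before the footer lines
    read.csv(file,
        skip = firstData - 1,          # lines before the CSV header row
        nrows = lastData - firstData,  # data rows following the header row
        stringsAsFactors = FALSE)
}
The nrows cutoff is what keeps read.csv from trying to parse the short footer lines.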
This is still about 2x slower than read.csv() on a pre-cleaned version of the same data, so I suspect (without having profiled it) that essentially all of this version's remaining bottleneck is in reading the file.