Matthew Dowle is working on a fast CSV reader for data.table. Here is a test case generated in R, along with some timings:
require(data.table)
n=1e6
DT = data.table(a=sample(1:1000,n,replace=TRUE),
                b=sample(1:1000,n,replace=TRUE),
                c=rnorm(n),
                d=sample(c("foo","bar","baz","qux","quux"),n,replace=TRUE),
                e=rnorm(n),
                f=sample(1:1000,n,replace=TRUE))
DT[2,b:=NA_integer_]
DT[4,c:=NA_real_]
DT[3,d:=NA_character_]
DT[5,d:=""]
DT[2,e:=+Inf]
DT[3,e:=-Inf]
write.table(DT,"test.csv",sep=",",row.names=FALSE,quote=FALSE)
cat("File size (MB):",round(file.info("test.csv")$size/1024^2),"\n") # 50 MB (1e6 rows x 6 columns)
system.time(DF1 <- read.csv("test.csv",stringsAsFactors=FALSE)) # 60 sec (first time in fresh R session)
system.time(DF1 <- read.csv("test.csv",stringsAsFactors=FALSE)) # 30 sec (immediate repeat is faster, varies)
system.time(DF2 <- read.table("test.csv",header=TRUE,sep=",",quote="",       # 10 sec (consistently)
                              stringsAsFactors=FALSE,comment.char="",nrows=n, # (all known tricks and known
                              colClasses=c("integer","integer","numeric",     #  nrows; see references)
                                           "character","numeric","integer")))
require(data.table)
system.time(DT <- fread("test.csv")) # 5 sec (faster and friendlier)
# The timings above were Matthew's. Here are mine on an SSD:
# R base: 15.5 secs
# R tricks: 3.4 secs
# data.table: 1.4 secs
Now, let's do some timings in Julia reading this file with the current read_table:
load("DataFrames"); using DataFrames
fname = "test.csv"
@time d = read_table(fname) # 26 secs
It's fairly slow. Here's the time it takes to read in a whole buffer:
@time a = open(readall, fname) # 0.14 sec
That's fast, but stepping through character by character is slow:
@time for i in 1:length(a)   # 15 secs
    chr = a[i]
end
Reading line by line is pretty fast:
@time a = open(readlines, fname) # 0.75 sec
Based on this, I tried a quick-and-dirty CSV reader that reads line by line and parses each line with sscanf. Matthew Dowle uses fscanf to pull in values; I couldn't get fscanf to work with ccall, so I bailed out on that.
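To show the basic mechanism, here's a minimal sketch of calling C's sscanf through ccall (this toy snippet is mine, not part of the reader below; note that %d writes a C int, so the target array needs to be Int32):

# sscanf writes through pointers, so pass one-element arrays as the targets
x = Array(Int32, 1)
y = Array(Float64, 1)
nparsed = ccall(:sscanf, Int32,
                (Ptr{Uint8}, Ptr{Uint8}, Ptr{Int32}, Ptr{Float64}),
                "7,3.14", "%d,%lg", x, y)
# nparsed == 2 on success, with x[1] == 7 and y[1] == 3.14

Anyway, here's the code for the reader: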
load("DataFrames"); using DataFrames
function myDataFrame(column_types::Vector, n::Int64)
    # The default constructor for DataFrames with this signature is ridiculously slow
    p = length(column_types)
    columns = Array(Any, p)
    names = Array(ByteString, p)
    for j in 1:p
        names[j] = "x$j"
    end
    for j in 1:p
        # preallocate each column as a DataVec with no entries flagged NA
        columns[j] = DataVec(Array(column_types[j], n), falses(n))
    end
    DataFrame(columns, names)
end
import DataFrames.read_table
function read_table(io::IOStream,
                    separator::Char,
                    quotation_character::Char,
                    missingness_indicator::String,
                    header::Bool,
                    column_types::Vector,
                    nrows::Int)
    d = myDataFrame(column_types, nrows)
    prep_sscanf(d)
    ncols = length(column_types)
    readline(io)    # skip the header; TODO: read it to get column names
    for i = 1:nrows
        ln = readline(io)
        ## @show(ln)
        res = my_sscanf(ln, d, i)
        ## @show(res)
    end
    d
end
getpointer{T<:Number}(x::Vector{T}, i::Int) = pointer(x, i)
getpointer{T<:Number}(x::DataVec{T}, i::Int) = pointer(x.data, i)
function getpointer{T<:String}(x::DataVec{T}, i::Int)
    x[i] = " "    # assign a placeholder so element i has a buffer for sscanf to fill
    pointer(x.data[i].data)
end
basetype(x) = eltype(x)
basetype{T<:ByteString}(x::DataVec{T}) = Uint8
function prep_sscanf(df::AbstractDataFrame)
    typemap = [Int => "%9d", Float64 => "%lg", Uint8 => "%[^,\n\r]", ASCIIString => "%[^,\n\r]"]
    types = colwise(basetype, df)
    typestr = map(x -> typemap[x], types)
    @show(typestr)
    format = join(typestr, ",")
    typelist = map(x -> Ptr{x}, types)
    @show typelist
    template = :(
        my_sscanf(dataline, df::AbstractDataFrame, i::Int) =
            ccall(:sscanf, Int,
                  (Ptr{Uint8}, Ptr{Uint8}),
                  dataline, $format)
    )
    for k in 1:ncol(df)
        # append each column's pointer type to the ccall's argument-type tuple...
        push(template.args[2].args[3].args, typelist[k])
        # ...and the matching getpointer argument to the call itself
        push(template.args[2].args, :(getpointer(df[$k],i)))
    end
    eval(template)
end
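For the six column types used below, the expression surgery ends up building roughly this method (illustrative, not the verbatim generated expression):

my_sscanf(dataline, df::AbstractDataFrame, i::Int) =
    ccall(:sscanf, Int,
          (Ptr{Uint8}, Ptr{Uint8}, Ptr{Int}, Ptr{Int}, Ptr{Float64},
           Ptr{Uint8}, Ptr{Float64}, Ptr{Int}),
          dataline, "%9d,%9d,%lg,%[^,\n\r],%lg,%9d",
          getpointer(df[1],i), getpointer(df[2],i), getpointer(df[3],i),
          getpointer(df[4],i), getpointer(df[5],i), getpointer(df[6],i))

Timing it on the test file: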
fname = "test.csv"
f = open(fname, "r")
@time d = read_table(f, ',', '"', "NA", true, {Int, Int, Float64, ASCIIString, Float64, Int}, 1000000)
close(f)
# elapsed time: 4.601014137268066 seconds
This is pretty crude, but it does show that it's possible to get faster CSV reading. The code is tricky in that I modified an expression to build the ccall the way I wanted it. This routine obviously isn't very robust, and the string handling is particularly clumsy. The idea is that you would use fscanf to rip through the file, and if fscanf fails because of NA's on a line or other troublesome parts, the code would retry that line with a slower algorithm.
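Here's a hedged sketch of that fallback idea (parse_row is a name I'm making up here, not part of the code above, and it assumes DataVec's indexed assignment accepts NA): sscanf returns the number of fields it successfully converted, so a short count flags a line that needs the careful path.

function parse_row(ln::String, d::AbstractDataFrame, i::Int)
    nfields = my_sscanf(ln, d, i)    # fast path: the generated sscanf method
    if nfields < ncol(d)             # short count: NA or other trouble on this line
        vals = split(chomp(ln), ',')
        for j in 1:length(vals)
            if vals[j] == "NA" || vals[j] == ""
                d[j][i] = NA         # mark the cell missing
            end
            # a full fallback would also re-convert the fields sscanf skipped
        end
    end
end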