@tshort
Created December 19, 2012 02:37
Explorations in reading CSV files in Julia

Matthew Dowle is working on a fast CSV reader for data.table. Here is a test case generated in R, along with some timings:

require(data.table)

n=1e6
DT = data.table( a=sample(1:1000,n,replace=TRUE),
                 b=sample(1:1000,n,replace=TRUE),
                 c=rnorm(n),
                 d=sample(c("foo","bar","baz","qux","quux"),n,replace=TRUE),
                 e=rnorm(n),
                 f=sample(1:1000,n,replace=TRUE) )
DT[2,b:=NA_integer_]
DT[4,c:=NA_real_]
DT[3,d:=NA_character_]
DT[5,d:=""]
DT[2,e:=+Inf]
DT[3,e:=-Inf]

write.table(DT,"test.csv",sep=",",row.names=FALSE,quote=FALSE)
cat("File size (MB):",round(file.info("test.csv")$size/1024^2),"\n")    # 50 MB (1e6 rows x 6 columns)


system.time(DF1 <- read.csv("test.csv",stringsAsFactors=FALSE))         # 60 sec (first time in fresh R session)
system.time(DF1 <- read.csv("test.csv",stringsAsFactors=FALSE))         # 30 sec (immediate repeat is faster, varies)

system.time(DF2 <- read.table("test.csv",header=TRUE,sep=",",quote="",  # 10 sec (consistently)
    stringsAsFactors=FALSE,comment.char="",nrows=n,                     # ( All known tricks and known
    colClasses=c("integer","integer","numeric",                         #   nrows, see references )
                 "character","numeric","integer")))

require(data.table)
system.time(DT <- fread("test.csv"))                                    #  5 sec (faster and friendlier)

# The timings above were Matthew's. Here are mine on an SSD:
# R base:     15.5 secs
# R tricks:    3.4 secs
# data.table:  1.4 secs

Now, let's do some timings in Julia reading this file with the current read_table:

load("DataFrames"); using DataFrames

fname = "test.csv"

@time d = read_table(fname)       # 26 secs

It's fairly slow. Here's the time it takes to read in a whole buffer:

@time a = open(readall, fname)    # 0.14 sec

That's fast, but stepping through character by character is slow:

@time for i in 1:length(a)        # 15 secs
    chr = a[i]
end

Reading line by line is pretty fast:

@time a = open(readlines, fname)  # 0.75 sec

Based on this, I tried a quick-and-dirty CSV reader that uses sscanf to parse line by line. Matthew Dowle uses fscanf to pull in values, but I couldn't get fscanf to work with ccall, so I bailed out on that.
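For reference, here is roughly what a direct ccall to C's sscanf looks like in the 2012-era Julia this gist targets. This is a minimal sketch with made-up values; note that %d expects a C int, so Int32 cells are used here:

# Parse two ints and a double from a hard-coded line into preallocated cells.
x = Array(Int32, 1)
y = Array(Int32, 1)
z = Array(Float64, 1)
n = ccall(:sscanf, Int,
          (Ptr{Uint8}, Ptr{Uint8}, Ptr{Int32}, Ptr{Int32}, Ptr{Float64}),
          "7,42,3.14", "%d,%d,%lg", x, y, z)
# n is the number of fields successfully converted (3 on success)

Anyway, here's the full reader: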

load("DataFrames"); using DataFrames


function myDataFrame(column_types::Vector, n::Int64)
  # The default constructor for DataFrames with this signature is ridiculously slow
  p = length(column_types)
  columns = Array(Any, p)
  names = Array(ByteString, p)
  for j in 1:p
    names[j] = "x$j"
  end
  for j in 1:p
    columns[j] = DataVec(Array(column_types[j], n), falses(n))
  end
  DataFrame(columns, names)
end
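
# A hypothetical call, using the test file's column layout:
#   d = myDataFrame({Int, Int, Float64, ASCIIString, Float64, Int}, 10)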


import DataFrames.read_table


function read_table(io::IOStream,
                    separator::Char,
                    quotation_character::Char,
                    missingness_indicator::String,
                    header::Bool,
                    column_types::Vector,
                    nrows::Int)
    d = myDataFrame(column_types, nrows)
    prep_sscanf(d)
    ncols = length(column_types)
    readline(io) # TODO: read the header to get names
    for i = 1:nrows
         ln = readline(io) # read one data line
         ## @show(ln)
         res = my_sscanf(ln, d, i)
         ## @show(res)
    end
    d
end

getpointer{T<:Number}(x::Vector{T}, i::Int) = pointer(x, i)
getpointer{T<:Number}(x::DataVec{T}, i::Int) = pointer(x.data, i)
function getpointer{T<:String}(x::DataVec{T}, i::Int)
    # Preallocate a wide padding string so sscanf has room to write the
    # matched field directly into this cell's byte buffer.
    x[i] = "                                                                                                                                                                                                    "
    pointer(x.data[i].data)
end
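
# The string trick above relies on sscanf's %[...] conversion writing bytes
# into whatever buffer it is handed; a quick hypothetical check:
#   buf = Array(Uint8, 32)   # caller-owned scratch buffer
#   ccall(:sscanf, Int, (Ptr{Uint8}, Ptr{Uint8}, Ptr{Uint8}),
#         "qux,0.5", "%[^,]", buf)
#   # buf now starts with the bytes 'q','u','x',0x00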

basetype(x) = eltype(x)
basetype{T<:ByteString}(x::DataVec{T}) = Uint8

function prep_sscanf(df::AbstractDataFrame)
    # scanf conversion specs: %9d = integer of up to 9 digits, %lg = double,
    # %[^,\n\r] = a run of bytes up to a comma or line break
    typemap = [Int => "%9d", Float64 => "%lg", Uint8 => "%[^,\n\r]", ASCIIString => "%[^,\n\r]"]
    types   = colwise(basetype, df)
    typestr = map(x -> typemap[x], types)
    @show(typestr)
    format = join(typestr, ",")
    typelist = map(x -> Ptr{x}, types)
    @show typelist
    template = :(
        my_sscanf(dataline, df::AbstractDataFrame, i::Int) =
            ccall(:sscanf, Int,
                   (Ptr{Uint8},Ptr{Uint8}),
                   dataline, $format)
    )
    for k in 1:ncol(df)
        push(template.args[2].args[3].args, typelist[k])
        push(template.args[2].args, :(getpointer(df[$k],i)))
    end
    eval(template)
end


fname = "test.csv"
f = open(fname, "r")
@time d = read_table(f, ',', '"', "NA", true, {Int, Int, Float64, ASCIIString, Float64, Int}, 1000000)      #"
close(f)
# elapsed time: 4.601014137268066 seconds

This is pretty crude, but it does show that it is possible to get faster CSV reading. The code is tricky in that I modified an expression to get the ccall the way I wanted it. This routine obviously isn't very robust, and the string handling is particularly clumsy. The idea is that you would use fscanf to rip through the file; if fscanf fails on a line because of NAs or other troublesome parts, the code would retry that line with a slower, more careful algorithm.
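To make the expression manipulation concrete: for the six columns above, the generated my_sscanf should expand to roughly the following (my reconstruction, not program output). prep_sscanf grows the ccall's type tuple with one pointer type per column and appends one getpointer argument per column:

my_sscanf(dataline, df::AbstractDataFrame, i::Int) =
    ccall(:sscanf, Int,
          (Ptr{Uint8}, Ptr{Uint8},
           Ptr{Int}, Ptr{Int}, Ptr{Float64}, Ptr{Uint8}, Ptr{Float64}, Ptr{Int}),
          dataline, "%9d,%9d,%lg,%[^,\n\r],%lg,%9d",
          getpointer(df[1], i), getpointer(df[2], i), getpointer(df[3], i),
          getpointer(df[4], i), getpointer(df[5], i), getpointer(df[6], i))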
