Last active
August 29, 2015 14:18
-
-
Save quinnj/2054b2018a5fb32ee983 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Validation Rules
# Each row must contain the same number of columns.
# Each field must be correctly formatted for the type of its column; it must be one of:
#   Empty (a missing value)
#   DATE
#   DATETIME
#   INTEGER
#   DOUBLE
#   STRING
#     quoted by f.quotechar, with f.escapechar allowed within f.quotechar to specify a literal f.quotechar, f.delim, or f.newline
module CSV | |
export CRLF | |
if VERSION > v"0.4.0-dev" | |
using Base.Dates | |
else | |
AbstractString = String | |
using Dates | |
end | |
# Short alias used throughout the module for "any string".
typealias Str AbstractString

# Exception thrown when the file breaks a validation rule: an undetectable
# newline, an unparseable field value, or a row with the wrong number of fields.
immutable CSVError <: Exception
    # Messages interpolate raw field values, which may contain non-ASCII
    # characters; the field therefore accepts any string type instead of
    # forcing a conversion to ASCIIString that would itself throw.
    msg::Str
end
# A fixed pair of characters, used to describe a two-character newline sequence.
immutable Chars
    one::Char   # first character of the sequence
    two::Char   # second character of the sequence
end

# Carriage return followed by line feed.
const CRLF = Chars('\r', '\n')

# A newline specification: either a single character or a two-character sequence.
typealias OneOrTwoChars Union(Char, Chars)
# Module-local character reader: hands the stream's `ios` handle and the
# reference `c` to the C routine `ios_getutf8`, which stores the character it
# reads in `c`. The routine's `Cint` result is returned unchanged.
@inline function read!(f::IOStream, c::Ref{Char})
    return ccall(:ios_getutf8, Cint, (Ptr{Void}, Ptr{Char}), f.ios, c)
end
# Description of a delimited text file and the layout it is expected to follow.
immutable File
    fullpath::Str            # path handed to `open` when the file is validated
    delim::Char              # character separating fields within a row
    newline::OneOrTwoChars   # row terminator: one character, or a pair such as CRLF
    quotechar::Char          # character that opens and closes a quoted field
    escapechar::Char         # inside quotes, the character following this one is taken literally
    headerrow::Int           # row number of the header (rows are counted from 1)
    datarow::Int             # row number of the first data row
    footerskip::Int          # stored as given; no code in this module reads it yet
    cols::Int                # number of fields every row must contain
    types::Vector            # per-column types that values are checked against
    formats::Vector{Str}     # per-column format strings passed along with each type check
end
# Build a `File` for the delimited file at `fullpath`.
#
# Arguments
#   fullpath   : path of an existing file
#   delim      : field delimiter (default ',')
# Keywords
#   newline    : row terminator; the default '\ufeff' is a sentinel meaning
#                "detect it from the first line of the file"
#   quotechar  : character that opens/closes a quoted field
#   escapechar : inside quotes, the character after this one is skipped unexamined
#   numcols    : number of columns; the default 0 means "count the delimiters
#                on the first line of the file"
#   types      : per-column types; when empty every column is treated as `Str`
#   formats    : per-column format strings; when empty every format is ""
#   headerrow, datarow, footerskip : stored on the returned `File` as given
#
# Throws `ArgumentError` when `fullpath` is not a file or `datarow` does not
# come after `headerrow`, and `CSVError` when newline detection was requested
# but the file contains no '\r' or '\n' outside quotes.
function File(fullpath::Str,
              delim::Char=',';
              newline::OneOrTwoChars='\ufeff',
              quotechar::Char='"',
              escapechar::Char='\\',
              numcols::Int=0,
              types::Vector=[],
              formats::Vector=[],
              headerrow::Int=1,
              datarow::Int=2,
              footerskip::Int=0)
    # argument checks
    isfile(fullpath) || throw(ArgumentError("$fullpath is not a valid file"))
    datarow > headerrow || throw(ArgumentError("data row ($datarow) must come after header row ($headerrow)"))
    dn = newline == '\ufeff'   # detect newline?
    dc = numcols == 0          # detect number of columns?
    n = newline
    ncols = 1
    if dn || dc
        f = open(fullpath)
        try
            c = Ref{Char}()
            # scan the first line only: count unquoted delimiters until a newline shows up
            while !eof(f)
                read!(f,c)
                if c.x == quotechar
                    # skip the quoted field; delimiters and newlines inside it do not count
                    while !eof(f)
                        read!(f,c)
                        if c.x == escapechar
                            # the escaped character is consumed without being examined
                            eof(f) || read!(f,c)
                        elseif c.x == quotechar
                            break
                        end
                    end
                elseif c.x == delim
                    ncols += 1
                elseif c.x == '\n'
                    n = '\n'
                    break
                elseif c.x == '\r'
                    # "\r\n" when a line feed follows, a bare '\r' otherwise (including at end of file)
                    if eof(f)
                        n = '\r'
                    else
                        read!(f,c)
                        n = c.x == '\n' ? CRLF : '\r'
                    end
                    break
                end
            end
        finally
            # release the handle even when reading fails
            close(f)
        end
        dn && n == '\ufeff' && throw(CSVError("couldn't detect a default newline, one of '\\r', '\\n', or \"\\r\\n\""))
    end
    newline = dn ? n : newline
    cols = dc ? ncols : numcols
    # TODO: column type detection is not implemented; without explicit `types`
    # every column is treated as a string.
    if isempty(types)
        types = fill!(Array(DataType,cols),Str)
    end
    if isempty(formats)
        formats = fill!(Array(Str,length(types)),"")
    end
    return File(fullpath,delim,newline,quotechar,escapechar,headerrow,datarow,footerskip,cols,types,formats)
end
# validatetype(value, T, f): check that the text `value` is acceptable for a
# column of type `T`; `f` is the column's format string. Returns nothing on
# success.

# Any text is acceptable for a string column.
validatetype{T<:Str}(value::Str,::Type{T},f) = return

# Numeric columns: `value` must parse as `T`, otherwise a `CSVError` is thrown.
function validatetype{T<:Real}(value::Str,::Type{T},f)
    # an empty field is a missing value, which the validation rules allow for every column
    isempty(value) && return
    t = tryparse(T,value)
    isnull(t) && throw(CSVError("$value is not a valid $T value"))
    return
end

# Date/time columns: `value` must be constructible as `T` using format `f`;
# whatever error the `T(value,f)` constructor raises propagates to the caller.
function validatetype{T<:TimeType}(value::Str,::Type{T},f)
    # an empty field is a missing value, which the validation rules allow for every column
    isempty(value) && return
    T(value,f)
    return
end
# checknewline(f, c, n): does the character just read into `c` complete the
# newline `n`?

# Single-character newline: a plain comparison.
@inline checknewline(f,c::Ref{Char},n::Char) = c.x::Char == n

# Two-character newline: when `c` holds the first character, peek at the next
# one. On a match the second character stays consumed and `true` is returned.
# Otherwise the stream is rewound and `c` is restored, so the caller still sees
# the original character and the peeked one is read again on its next turn.
@inline function checknewline(f,c::Ref{Char},n::Chars)
    c.x::Char == n.one || return false
    eof(f) && return false
    mark(f)
    read!(f,c)
    c.x::Char == n.two && return true
    reset(f)
    c.x = n.one
    return false
end

# Consume the rest of a quoted field, up to and including the closing quote `q`.
# The character following an escape `e` is skipped without being examined.
function skipquoted!(f,c,q,e)
    while !eof(f)
        read!(f,c)
        if c.x == e
            eof(f) || read!(f,c)
        elseif c.x == q
            break
        end
    end
    return
end

# Type-check field number `k` of the current row. Fields beyond the declared
# columns are left alone here; the field-count check reports them as a CSVError.
function validatefield(value,types,formats,k)
    (k <= length(types) && k <= length(formats)) || return
    validatetype(value,types[k],formats[k])
    return
end

# validateline!(file, f, c, i, buf, isheader, Val{checktypes})
# Read one row from `f` and throw a `CSVError` unless it contains exactly
# `file.cols` fields. `i` is the row number used in the error message.

# Field-count check only; `buf` and `isheader` are unused by this method.
function validateline!(file,f,c,i,buf,isheader,::Type{Val{false}})
    q = file.quotechar
    e = file.escapechar
    d = file.delim
    n = file.newline
    cols = file.cols
    fieldsfound = 1
    while !eof(f)
        read!(f,c)
        if c.x == q                    # quoted field: its delimiters/newlines do not count
            skipquoted!(f,c,q,e)
        elseif c.x == d                # unquoted delimiter starts the next field
            fieldsfound += 1
        elseif checknewline(f,c,n)     # handles one- and two-character newlines
            break
        end
    end
    fieldsfound == cols || throw(CSVError("error parsing line $i: expected $cols fields, only detected $fieldsfound"))
end

# Field-count check plus per-field type check. Each field's text is collected
# in `buf` and, unless `isheader` is true, checked with `validatetype` against
# the column's entry in `file.types` / `file.formats`.
function validateline!(file,f,c,i,buf,isheader,::Type{Val{true}})
    q = file.quotechar
    e = file.escapechar
    d = file.delim
    n = file.newline
    cols = file.cols
    types = file.types
    formats = file.formats
    fieldsfound = 1
    while !eof(f)
        read!(f,c)
        if c.x == q                    # quoted field: copy its content until the closing quote
            while !eof(f)
                read!(f,c)
                if c.x == e
                    # keep the escape character, then take the next character
                    # verbatim; it is written exactly once by the write below
                    write(buf,c.x)
                    eof(f) && break
                    read!(f,c)
                elseif c.x == q
                    break
                end
                write(buf,c.x)
            end
        elseif c.x == d                # unquoted delimiter: the current field is complete
            t = takebuf_string(buf)
            isheader || validatefield(t,types,formats,fieldsfound)
            fieldsfound += 1
        elseif checknewline(f,c,n)     # handles one- and two-character newlines
            break
        else
            write(buf,c.x)
        end
    end
    # the last field of the row ends at the newline (or at end of file)
    t = takebuf_string(buf)
    isheader || validatefield(t,types,formats,fieldsfound)
    fieldsfound == cols || throw(CSVError("error parsing line $i: expected $cols fields, only detected $fieldsfound"))
end

# Advance `f` until row number `n` is about to be read (or the file ends),
# starting from current row number `i`. Newlines inside quoted fields do not
# end a row. Returns the row number reached.
function skiplinesto!(file,f,c,i,n)
    newline = file.newline
    q = file.quotechar
    e = file.escapechar
    while !eof(f)
        i == n && break
        read!(f,c)
        if c.x == q
            skipquoted!(f,c,q,e)
        elseif checknewline(f,c,newline)
            i += 1
        end
    end
    return i
end
# Validate the file described by `file`: the header row and every row from
# `file.datarow` onward must contain `file.cols` fields, and, when `checktypes`
# is true, every data field must be acceptable for its column type.
#
#   buf        : scratch buffer that collects one field at a time; if its size
#                is fixed, it needs to be at least as big as the largest
#                expected field (value for a single column) in the file
#   verbose    : print a progress line every 100000 rows
#   checktypes : also check each data field against `file.types`
#
# Returns nothing; the first violation is reported by a thrown exception.
# NOTE: `file.footerskip` is not consulted, so trailing footer rows are
# validated like data rows.
function validate(file::File,buf::IOBuffer=IOBuffer();verbose::Bool=true,checktypes::Bool=true)
    f = open(file.fullpath)
    try
        c = Ref{Char}()
        # lift the flag to a type so the matching validateline! method is picked by dispatch
        t = checktypes ? Val{true} : Val{false}
        i = skiplinesto!(file,f,c,1,file.headerrow)
        validateline!(file,f,c,file.headerrow,buf,true,t)
        i += 1
        i = skiplinesto!(file,f,c,i,file.datarow)
        while !eof(f)
            validateline!(file,f,c,i,buf,false,t)
            verbose && i % 100000 == 0 && println("Validated $i rows...")
            i += 1
        end
    finally
        # close the stream whether validation succeeded or threw
        close(f)
    end
    return nothing
end
end # module |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment