quinnj · August 29, 2015 14:18
diff --git a/CSV Validation b/CSV Validation
 # Validation Rules
 # Each row must contain the same number of columns
 # Each field must be formatted correctly and be of the right type and format, must be one of:
  # Empty, missing value
  # DATE
  # DATETIME
  # INTEGER
  # DOUBLE
  # STRING
  # quoted by f.quotechar, with f.escapechar allowed within f.quotechar to specify literal f.quotechar, f.delim, or f.newline

 module CSV

 export CRLF

 if VERSION > v"0.4.0-dev"
    using Base.Dates
 else
    AbstractString = String
    using Dates
 end

 typealias Str AbstractString

 # Exception thrown when a CSV file cannot be parsed or fails validation.
 #
 # `msg` is typed abstractly on purpose: the messages built in this module
 # interpolate file contents (field values), which may contain non-ASCII
 # characters. With an `ASCIIString` field the constructor's implicit `convert`
 # would itself fail on such a message and mask the real validation error.
 immutable CSVError <: Exception
    msg::AbstractString
 end

 # A fixed pair of characters, used to represent a two-character line
 # terminator (see `CRLF` below).
 immutable Chars
    one::Char   # first character of the pair
    two::Char   # second character of the pair
 end

 # The carriage-return + line-feed newline sequence ("\r\n"); exported.
 const CRLF = Chars('\r','\n')

 # A newline is either a single `Char` or a two-character `Chars` pair.
 typealias OneOrTwoChars Union(Char,Chars)

 # Fill `c` from stream `f` by handing the stream's underlying `ios` buffer and
 # the reference to the C routine `ios_getutf8`; the `Cint` it returns is passed
 # back unchanged.
 # NOTE(review): presumably this decodes exactly one UTF-8 character and leaves
 # `c` untouched at end of file -- confirm against the runtime's ios.c.
 @inline function read!(f::IOStream,c::Ref{Char})
    return ccall(:ios_getutf8, Cint, (Ptr{Void}, Ptr{Char}), f.ios, c)
 end

 # Immutable description of a CSV file and of how it should be parsed.
 # Instances are normally built through the keyword constructor `File(fullpath, delim; ...)`.
 immutable File
    fullpath::Str           # path of the file on disk
    delim::Char             # character separating fields within a row
    newline::OneOrTwoChars  # line terminator: a single Char or a Chars pair such as CRLF
    quotechar::Char         # character that opens and closes a quoted field
    escapechar::Char        # inside a quoted field, the character after this one is taken literally
    headerrow::Int          # line number of the header row
    datarow::Int            # line number of the first data row
    footerskip::Int         # stored as given; not consulted by the code visible in this file
    cols::Int               # number of fields every row is expected to contain
    types::Vector           # per-column type handed to `validatetype`
    formats::Vector{Str}    # per-column format string handed to `validatetype`
 end

 # Build a `File` for the CSV at `fullpath`, detecting the newline sequence
 # and/or the number of columns from the first line of the file when they are
 # not supplied.
 #
 # Arguments
 #   fullpath    path of an existing file
 #   delim       field delimiter (default ',')
 # Keywords
 #   newline     line terminator; the default '\ufeff' is a sentinel meaning "detect it"
 #   quotechar   character that opens/closes a quoted field
 #   escapechar  inside quotes, the character following it is skipped
 #   numcols     expected column count; 0 means "detect it"
 #   types       per-column types; when empty every column defaults to `Str`
 #   formats     per-column format strings; when empty every column gets ""
 #   headerrow, datarow, footerskip   stored on the result as given
 #
 # Throws `ArgumentError` when `fullpath` is not a file or `datarow <= headerrow`,
 # and `CSVError` when newline detection was requested but no terminator was found.
 #
 # NOTE(review): detection always scans the first physical line of the file,
 # regardless of `headerrow` -- confirm that is intended.
 function File(fullpath::Str,
              delim::Char=',';
              newline::OneOrTwoChars='\ufeff',
              quotechar::Char='"',
              escapechar::Char='\\',
              numcols::Int=0,
              types::Vector=[],
              formats::Vector=[],
              headerrow::Int=1,
              datarow::Int=2,
              footerskip::Int=0)
    # argument checks
    isfile(fullpath) || throw(ArgumentError("$fullpath is not a valid file"))
    datarow > headerrow || throw(ArgumentError("data row ($datarow) must come after header row ($headerrow)"))

    c = Ref{Char}()
    # detect newline
    dn = newline == '\ufeff'
    # detect number of columns
    dc = numcols == 0

    if dn || dc
        n = newline
        ncols = 1
        f = open(fullpath)
        try
            while !eof(f)
                read!(f,c)
                if c.x == quotechar
                    # skip over the quoted field; delimiters and newlines inside it do not count
                    while !eof(f)
                        read!(f,c)
                        if c.x == escapechar
                            read!(f,c) # swallow the escaped character
                        elseif c.x == quotechar
                            break
                        end
                    end
                elseif c.x == delim
                    ncols += 1
                elseif c.x == '\n'
                    n = '\n'
                    break
                elseif c.x == '\r'
                    # look one character ahead to tell "\r\n" from a bare '\r';
                    # a '\r' that ends the file is a bare '\r'
                    if eof(f)
                        n = '\r'
                    else
                        read!(f,c)
                        n = c.x == '\n' ? CRLF : '\r'
                    end
                    break
                end
            end
        finally
            close(f) # the handle is only needed for detection
        end
        dn && n == '\ufeff' && throw(CSVError("couldn't detect a default newline, one of '\r', '\n', or \"\r\n\""))
    end
    newline = dn ? n : newline
    cols = dc ? ncols : numcols
    # TODO: detect column types by sampling rows; until then untyped columns are `Str`
    if isempty(types)
        types = Array(DataType,cols)
        fill!(types,Str)
    end
    if isempty(formats)
        formats = Array(Str,length(types))
        fill!(formats,"")
    end
    return File(fullpath,delim,newline,quotechar,escapechar,headerrow,datarow,footerskip,cols,types,formats)
 end

 # validatetype(value, T, f): check that the raw text `value` of one field is
 # acceptable for column type `T`; `f` is the column's format string.
 # Returns `nothing` on success.
 #
 # An empty field is accepted for every type, because the validation rules of
 # this module list "Empty, missing value" as a legal field.

 # Strings: any text is valid.
 validatetype{T<:Str}(value::Str,::Type{T},f) = return
 # Numbers: the text must parse as `T`; otherwise a `CSVError` is thrown.
 function validatetype{T<:Real}(value::Str,::Type{T},f)
    isempty(value) && return
    t = tryparse(T,value)
    isnull(t) && throw(CSVError("$value is not a valid $T value"))
    return
 end
 # Dates/times: constructing `T` from the text must succeed; whatever the
 # constructor throws propagates to the caller.
 function validatetype{T<:TimeType}(value::Str,::Type{T},f)
    isempty(value) && return
    # "" is the placeholder `File` assigns when no format was supplied; in that
    # case rely on the type's default parsing instead of an empty format
    f == "" ? T(value) : T(value,f)
    return
 end

 # checknewline(f, c, n): report whether the character most recently read into
 # `c` terminates the current line, given the file's newline `n`.

 # Single-character newline: a plain comparison.
 @inline checknewline(f,c::Ref{Char},n::Char) = c.x::Char == n
 # Two-character newline: when `c` holds the first character, read one more
 # from `f`. On a match that second character stays consumed and `true` is
 # returned; otherwise the stream is rewound and `c` is put back to the
 # character the caller had read, so nothing is lost or seen twice.
 @inline function checknewline(f,c::Ref{Char},n::Chars)
    lead = c.x::Char
    if lead == n.one
        mark(f)
        read!(f,c)
        c.x::Char == n.two && return true
        reset(f)
        c.x = lead
    end
    return false
 end

 # Validate the shape of one row without checking field types
 # (the `Val{false}` method, selected by `validate(...; checktypes=false)`).
 #
 # Consumes characters from `f` up to and including the next newline (or end of
 # file), counting delimiters that lie outside quoted fields. `i` is the line
 # number used in the error message; `buf` and `isheader` are unused here and
 # exist only so both methods share one signature.
 # Throws `CSVError` when the number of fields differs from `file.cols`.
 function validateline!(file,f,c,i,buf,isheader,::Type{Val{false}})
    q = file.quotechar
    e = file.escapechar
    d = file.delim
    n = file.newline
    cols = file.cols
    fieldsfound = 1
    while !eof(f)
        read!(f,c)
        if c.x == q # if we run into a quote character
            while !eof(f) # keep reading until we reach the closing q
                read!(f,c)
                if c.x == e
                    read!(f,c) # skip whatever character is being escaped
                elseif c.x == q
                    break
                end
            end
        elseif c.x == d # if we've found a delimiter
            fieldsfound += 1
        elseif checknewline(f,c,n) # handles both Char and two-character newlines
            break
        end
    end
    fieldsfound == cols || throw(CSVError("error parsing line $i: expected $cols fields, only detected $fieldsfound"))
 end

 # Validate one row including the type of every field
 # (the `Val{true}` method, selected by `validate(...; checktypes=true)`).
 #
 # Consumes characters from `f` up to and including the next newline (or end of
 # file). The text of each field is accumulated in `buf` and, unless `isheader`
 # is true, passed to `validatetype` with the column's type and format.
 # `i` is the line number used in the error message.
 # Throws `CSVError` when the number of fields differs from `file.cols`, plus
 # whatever `validatetype` throws for a bad value.
 function validateline!(file,f,c,i,buf,isheader,::Type{Val{true}})
    q = file.quotechar
    e = file.escapechar
    d = file.delim
    n = file.newline
    cols = file.cols
    types = file.types
    formats = file.formats
    fieldsfound = 1
    while !eof(f)
        read!(f,c)
        if c.x == q # if we run into a quote character
            while !eof(f) # keep reading until we reach the closing q
                read!(f,c)
                if c.x == e
                    # keep the escape character, then fetch the escaped one;
                    # the shared write below stores it exactly once
                    write(buf,c.x)
                    read!(f,c)
                elseif c.x == q
                    break
                end
                write(buf,c.x)
            end
        elseif c.x == d # if we've found a delimiter
            t = takebuf_string(buf)
            # fields beyond the expected count have no declared type; skip them
            # so the field-count error below is what gets reported
            !isheader && fieldsfound <= cols && validatetype(t,types[fieldsfound],formats[fieldsfound])
            fieldsfound += 1
        elseif checknewline(f,c,n) # handles both Char and two-character newlines
            break
        else
            write(buf,c.x)
        end
    end
    # the last field of the row has no trailing delimiter
    t = takebuf_string(buf)
    !isheader && fieldsfound <= cols && validatetype(t,types[fieldsfound],formats[fieldsfound])
    fieldsfound == cols || throw(CSVError("error parsing line $i: expected $cols fields, only detected $fieldsfound"))
 end

 # Advance stream `f` until the line counter reaches `n` or the file ends.
 #
 # `i` is the number of the line the stream is currently positioned on. Every
 # newline found outside a quoted field increments it. Returns the updated
 # counter, which is smaller than `n` only if the file ended first.
 function skiplinesto!(file,f,c,i,n)
    newline = file.newline
    q = file.quotechar
    e = file.escapechar
    while !eof(f)
        i == n && break
        read!(f,c)
        if c.x == q
            # newlines inside a quoted field belong to the field, not the row
            while !eof(f)
                read!(f,c)
                if c.x == e
                    read!(f,c) # skip the escaped character
                elseif c.x == q
                    break
                end
            end
        elseif checknewline(f,c,newline) # handles both Char and two-character newlines
            i += 1
        end
    end
    return i
 end

 # Validate the CSV described by `file`: the header row and every following
 # data row must contain `file.cols` fields and, when `checktypes` is true,
 # every data field must satisfy `validatetype` for its column.
 #
 # Arguments
 #   file        the `File` to validate
 #   buf         scratch buffer for field text; if size of buf is fixed, it needs
 #               to be at least as big as the largest expected field (value for a
 #               single column) in file
 # Keywords
 #   verbose     print a progress line every 100000 rows
 #   checktypes  also validate field types, not just the field count
 #
 # Returns `nothing`; throws on the first invalid row. The file is closed on
 # every exit path, including when validation throws.
 # NOTE(review): `file.footerskip` is not consulted here -- confirm whether
 # trailing rows should be excluded.
 function validate(file::File,buf::IOBuffer=IOBuffer();verbose::Bool=true,checktypes::Bool=true)
    f = open(file.fullpath)
    try
        c = Ref{Char}()
        t = checktypes ? Val{true} : Val{false}
        i = 1
        i = skiplinesto!(file,f,c,i,file.headerrow)
        validateline!(file,f,c,file.headerrow,buf,true,t)
        i += 1
        i = skiplinesto!(file,f,c,i,file.datarow)
        while !eof(f)
            validateline!(file,f,c,i,buf,false,t)
            verbose && i % 100000 == 0 && println("Validated $i rows...")
            i += 1
        end
    finally
        close(f)
    end
    return nothing
 end

 end # module
	# Validation Rules
	# Each row must contain the same number of columns
	# Each field must be formatted correctly and be of the right type and format, must be one of:
	# Empty, missing value
	# DATE
	# DATETIME
	# INTEGER
	# DOUBLE
	# STRING
	# quoted by f.quotechar, with f.escapechar allowed within f.quotechar to specify literal f.quotechar, f.delim, or f.newline

	module CSV

	export CRLF

	if VERSION > v"0.4.0-dev"
	using Base.Dates
	else
	AbstractString = String
	using Dates
	end

	typealias Str AbstractString

	# Exception thrown when a CSV file cannot be parsed or fails validation.
	#
	# `msg` is typed abstractly on purpose: the messages built in this module
	# interpolate file contents (field values), which may contain non-ASCII
	# characters. With an `ASCIIString` field the constructor's implicit `convert`
	# would itself fail on such a message and mask the real validation error.
	immutable CSVError <: Exception
	    msg::AbstractString
	end

	# A fixed pair of characters, used to represent a two-character line
	# terminator (see `CRLF` below).
	immutable Chars
	# first character of the pair
	one::Char
	# second character of the pair
	two::Char
	end

	# The carriage-return + line-feed newline sequence ("\r\n"); exported.
	const CRLF = Chars('\r','\n')

	# A newline is either a single `Char` or a two-character `Chars` pair.
	typealias OneOrTwoChars Union(Char,Chars)

	# Fill `c` from stream `f` by handing the stream's underlying `ios` buffer and
	# the reference to the C routine `ios_getutf8`; the `Cint` it returns is passed
	# back unchanged.
	# NOTE(review): presumably this decodes exactly one UTF-8 character and leaves
	# `c` untouched at end of file -- confirm against the runtime's ios.c.
	@inline function read!(f::IOStream,c::Ref{Char})
	    return ccall(:ios_getutf8, Cint, (Ptr{Void}, Ptr{Char}), f.ios, c)
	end

	# Immutable description of a CSV file and of how it should be parsed.
	# Instances are normally built through the keyword constructor `File(fullpath, delim; ...)`.
	immutable File
	# path of the file on disk
	fullpath::Str
	# character separating fields within a row
	delim::Char
	# line terminator: a single Char or a Chars pair such as CRLF
	newline::OneOrTwoChars
	# character that opens and closes a quoted field
	quotechar::Char
	# inside a quoted field, the character after this one is taken literally
	escapechar::Char
	# line number of the header row
	headerrow::Int
	# line number of the first data row
	datarow::Int
	# stored as given; not consulted by the code visible in this file
	footerskip::Int
	# number of fields every row is expected to contain
	cols::Int
	# per-column type handed to `validatetype`
	types::Vector
	# per-column format string handed to `validatetype`
	formats::Vector{Str}
	end

	# Build a `File` for the CSV at `fullpath`, detecting the newline sequence
	# and/or the number of columns from the first line of the file when they are
	# not supplied.
	#
	# Arguments
	#   fullpath    path of an existing file
	#   delim       field delimiter (default ',')
	# Keywords
	#   newline     line terminator; the default '\ufeff' is a sentinel meaning "detect it"
	#   quotechar   character that opens/closes a quoted field
	#   escapechar  inside quotes, the character following it is skipped
	#   numcols     expected column count; 0 means "detect it"
	#   types       per-column types; when empty every column defaults to `Str`
	#   formats     per-column format strings; when empty every column gets ""
	#   headerrow, datarow, footerskip   stored on the result as given
	#
	# Throws `ArgumentError` when `fullpath` is not a file or `datarow <= headerrow`,
	# and `CSVError` when newline detection was requested but no terminator was found.
	#
	# NOTE(review): detection always scans the first physical line of the file,
	# regardless of `headerrow` -- confirm that is intended.
	function File(fullpath::Str,
	              delim::Char=',';
	              newline::OneOrTwoChars='\ufeff',
	              quotechar::Char='"',
	              escapechar::Char='\\',
	              numcols::Int=0,
	              types::Vector=[],
	              formats::Vector=[],
	              headerrow::Int=1,
	              datarow::Int=2,
	              footerskip::Int=0)
	    # argument checks
	    isfile(fullpath) || throw(ArgumentError("$fullpath is not a valid file"))
	    datarow > headerrow || throw(ArgumentError("data row ($datarow) must come after header row ($headerrow)"))

	    c = Ref{Char}()
	    # detect newline
	    dn = newline == '\ufeff'
	    # detect number of columns
	    dc = numcols == 0

	    if dn || dc
	        n = newline
	        ncols = 1
	        f = open(fullpath)
	        try
	            while !eof(f)
	                read!(f,c)
	                if c.x == quotechar
	                    # skip over the quoted field; delimiters and newlines inside it do not count
	                    while !eof(f)
	                        read!(f,c)
	                        if c.x == escapechar
	                            read!(f,c) # swallow the escaped character
	                        elseif c.x == quotechar
	                            break
	                        end
	                    end
	                elseif c.x == delim
	                    ncols += 1
	                elseif c.x == '\n'
	                    n = '\n'
	                    break
	                elseif c.x == '\r'
	                    # look one character ahead to tell "\r\n" from a bare '\r';
	                    # a '\r' that ends the file is a bare '\r'
	                    if eof(f)
	                        n = '\r'
	                    else
	                        read!(f,c)
	                        n = c.x == '\n' ? CRLF : '\r'
	                    end
	                    break
	                end
	            end
	        finally
	            close(f) # the handle is only needed for detection
	        end
	        dn && n == '\ufeff' && throw(CSVError("couldn't detect a default newline, one of '\r', '\n', or \"\r\n\""))
	    end
	    newline = dn ? n : newline
	    cols = dc ? ncols : numcols
	    # TODO: detect column types by sampling rows; until then untyped columns are `Str`
	    if isempty(types)
	        types = Array(DataType,cols)
	        fill!(types,Str)
	    end
	    if isempty(formats)
	        formats = Array(Str,length(types))
	        fill!(formats,"")
	    end
	    return File(fullpath,delim,newline,quotechar,escapechar,headerrow,datarow,footerskip,cols,types,formats)
	end

	# validatetype(value, T, f): check that the raw text `value` of one field is
	# acceptable for column type `T`; `f` is the column's format string.
	# Returns `nothing` on success.
	#
	# An empty field is accepted for every type, because the validation rules of
	# this module list "Empty, missing value" as a legal field.

	# Strings: any text is valid.
	validatetype{T<:Str}(value::Str,::Type{T},f) = return
	# Numbers: the text must parse as `T`; otherwise a `CSVError` is thrown.
	function validatetype{T<:Real}(value::Str,::Type{T},f)
	    isempty(value) && return
	    t = tryparse(T,value)
	    isnull(t) && throw(CSVError("$value is not a valid $T value"))
	    return
	end
	# Dates/times: constructing `T` from the text must succeed; whatever the
	# constructor throws propagates to the caller.
	function validatetype{T<:TimeType}(value::Str,::Type{T},f)
	    isempty(value) && return
	    # "" is the placeholder `File` assigns when no format was supplied; in that
	    # case rely on the type's default parsing instead of an empty format
	    f == "" ? T(value) : T(value,f)
	    return
	end

	# checknewline(f, c, n): report whether the character most recently read into
	# `c` terminates the current line, given the file's newline `n`.

	# Single-character newline: a plain comparison.
	@inline checknewline(f,c::Ref{Char},n::Char) = c.x::Char == n
	# Two-character newline: when `c` holds the first character, read one more
	# from `f`. On a match that second character stays consumed and `true` is
	# returned; otherwise the stream is rewound and `c` is put back to the
	# character the caller had read, so nothing is lost or seen twice.
	@inline function checknewline(f,c::Ref{Char},n::Chars)
	    lead = c.x::Char
	    if lead == n.one
	        mark(f)
	        read!(f,c)
	        c.x::Char == n.two && return true
	        reset(f)
	        c.x = lead
	    end
	    return false
	end

	# Validate the shape of one row without checking field types
	# (the `Val{false}` method, selected by `validate(...; checktypes=false)`).
	#
	# Consumes characters from `f` up to and including the next newline (or end of
	# file), counting delimiters that lie outside quoted fields. `i` is the line
	# number used in the error message; `buf` and `isheader` are unused here and
	# exist only so both methods share one signature.
	# Throws `CSVError` when the number of fields differs from `file.cols`.
	function validateline!(file,f,c,i,buf,isheader,::Type{Val{false}})
	    q = file.quotechar
	    e = file.escapechar
	    d = file.delim
	    n = file.newline
	    cols = file.cols
	    fieldsfound = 1
	    while !eof(f)
	        read!(f,c)
	        if c.x == q # if we run into a quote character
	            while !eof(f) # keep reading until we reach the closing q
	                read!(f,c)
	                if c.x == e
	                    read!(f,c) # skip whatever character is being escaped
	                elseif c.x == q
	                    break
	                end
	            end
	        elseif c.x == d # if we've found a delimiter
	            fieldsfound += 1
	        elseif checknewline(f,c,n) # handles both Char and two-character newlines
	            break
	        end
	    end
	    fieldsfound == cols || throw(CSVError("error parsing line $i: expected $cols fields, only detected $fieldsfound"))
	end

	# Validate one row including the type of every field
	# (the `Val{true}` method, selected by `validate(...; checktypes=true)`).
	#
	# Consumes characters from `f` up to and including the next newline (or end of
	# file). The text of each field is accumulated in `buf` and, unless `isheader`
	# is true, passed to `validatetype` with the column's type and format.
	# `i` is the line number used in the error message.
	# Throws `CSVError` when the number of fields differs from `file.cols`, plus
	# whatever `validatetype` throws for a bad value.
	function validateline!(file,f,c,i,buf,isheader,::Type{Val{true}})
	    q = file.quotechar
	    e = file.escapechar
	    d = file.delim
	    n = file.newline
	    cols = file.cols
	    types = file.types
	    formats = file.formats
	    fieldsfound = 1
	    while !eof(f)
	        read!(f,c)
	        if c.x == q # if we run into a quote character
	            while !eof(f) # keep reading until we reach the closing q
	                read!(f,c)
	                if c.x == e
	                    # keep the escape character, then fetch the escaped one;
	                    # the shared write below stores it exactly once
	                    write(buf,c.x)
	                    read!(f,c)
	                elseif c.x == q
	                    break
	                end
	                write(buf,c.x)
	            end
	        elseif c.x == d # if we've found a delimiter
	            t = takebuf_string(buf)
	            # fields beyond the expected count have no declared type; skip them
	            # so the field-count error below is what gets reported
	            !isheader && fieldsfound <= cols && validatetype(t,types[fieldsfound],formats[fieldsfound])
	            fieldsfound += 1
	        elseif checknewline(f,c,n) # handles both Char and two-character newlines
	            break
	        else
	            write(buf,c.x)
	        end
	    end
	    # the last field of the row has no trailing delimiter
	    t = takebuf_string(buf)
	    !isheader && fieldsfound <= cols && validatetype(t,types[fieldsfound],formats[fieldsfound])
	    fieldsfound == cols || throw(CSVError("error parsing line $i: expected $cols fields, only detected $fieldsfound"))
	end

	# Advance stream `f` until the line counter reaches `n` or the file ends.
	#
	# `i` is the number of the line the stream is currently positioned on. Every
	# newline found outside a quoted field increments it. Returns the updated
	# counter, which is smaller than `n` only if the file ended first.
	function skiplinesto!(file,f,c,i,n)
	    newline = file.newline
	    q = file.quotechar
	    e = file.escapechar
	    while !eof(f)
	        i == n && break
	        read!(f,c)
	        if c.x == q
	            # newlines inside a quoted field belong to the field, not the row
	            while !eof(f)
	                read!(f,c)
	                if c.x == e
	                    read!(f,c) # skip the escaped character
	                elseif c.x == q
	                    break
	                end
	            end
	        elseif checknewline(f,c,newline) # handles both Char and two-character newlines
	            i += 1
	        end
	    end
	    return i
	end

	# Validate the CSV described by `file`: the header row and every following
	# data row must contain `file.cols` fields and, when `checktypes` is true,
	# every data field must satisfy `validatetype` for its column.
	#
	# Arguments
	#   file        the `File` to validate
	#   buf         scratch buffer for field text; if size of buf is fixed, it needs
	#               to be at least as big as the largest expected field (value for a
	#               single column) in file
	# Keywords
	#   verbose     print a progress line every 100000 rows
	#   checktypes  also validate field types, not just the field count
	#
	# Returns `nothing`; throws on the first invalid row. The file is closed on
	# every exit path, including when validation throws.
	# NOTE(review): `file.footerskip` is not consulted here -- confirm whether
	# trailing rows should be excluded.
	function validate(file::File,buf::IOBuffer=IOBuffer();verbose::Bool=true,checktypes::Bool=true)
	    f = open(file.fullpath)
	    try
	        c = Ref{Char}()
	        t = checktypes ? Val{true} : Val{false}
	        i = 1
	        i = skiplinesto!(file,f,c,i,file.headerrow)
	        validateline!(file,f,c,file.headerrow,buf,true,t)
	        i += 1
	        i = skiplinesto!(file,f,c,i,file.datarow)
	        while !eof(f)
	            validateline!(file,f,c,i,buf,false,t)
	            verbose && i % 100000 == 0 && println("Validated $i rows...")
	            i += 1
	        end
	    finally
	        close(f)
	    end
	    return nothing
	end

	end # module