tpapp · May 13, 2017 06:15
diff --git a/test.jl b/test.jl
 ######################################################################
 # MWE for tabulating large CSV files, using plain Julia and IterateTables
 # See the discussion at
 # https://discourse.julialang.org/t/iterate-delimited-file-as-namedtuples/3616/3
 #
 # Some simplifications:
 # - using the index of iterated tuples to make it comparable,
 #   but would prefer to use the column name in real code.
 # - not using compressed files (because CSV does not support Zlib streams ATM)
 ######################################################################

 using CSV
 using IterableTables

 ######################################################################
 # generating fake data
 ######################################################################

 """
 Generate `lines` of fake data, overwriting `filename`.

 Each line has the given `columns`, filled with random integers, except
 for the `col_index` column which is filled with random
 `values`. `delim` is the delimiter.
 """
 function gen_data(filename, lines, columns, col_index;
                  values = string.('A':'Z'), delim = ';')
    @assert col_index ∈ 1:columns
    io = open(filename, "w")
    for i in 1:lines
        for j in 1:(col_index-1)
            print(io, rand(1:100000), ";")
        end
        print(io, rand(values))
        for j in (col_index+1):columns
            print(io, ";", rand(1:100000))
        end
        println(io)
    end
    close(io)
 end
                  

 ######################################################################
 # the dumb version
 ######################################################################

 """
 Read lines from `io`, call `f` on its fields as a vector.

 **Important**: `split` is used with `delim`. No fancy CSV escapes.

 # Keyword arguments:

 - when `progress > 0`, print a dot for that many lines read.
 - when `limit > 0`, only split up to that many fields. 
 - `delim` is the delimiter.
 - when `maxlines > 0`, limit the number of lines read (for testing etc).
 """
 function dolines(f, io;
                 progress = 1000000, limit = 0, delim = ';', maxlines = 0)
    line = 0
    while !eof(io)
        if isa(progress, Integer) && line % progress == 0
            print(".")
        end
        line += 1
        if maxlines > 0 && line ≥ maxlines
            break
        end
        f(split(readline(io), delim; limit = limit))
    end
 end

 "Count the frequencies of strings in `col_index`, return as a `Dict`."
 function count_field(filename, col_index; options...)
    c = Dict{String,Int}()
    open(filename, "r") do io
        dolines(io;
                limit = col_index + 1, delim = ';', options...) do fields
                    kind = fields[col_index]
                    c[kind] = get(c, kind, 0) + 1
                end
    end
    c
 end

 ######################################################################
 # using IterateTables and CSV
 ######################################################################

 """
 Same as `count_field` above, but using an iterator over lines, picking
 a fixed column called `AM`.
 """
 function count_field_iterate_colname(itr)
    c = Dict{String, Int}()
    for i in itr
        field = i.AM            # colum fixed
        if !isnull(field)
            kind = get(field)
            c[kind] = get(c, kind, 0) + 1
        end
    end
    c
 end

 function count_field_iterate_colname(filename::String,
                                     col_index, col_num; delim = ';')
    open(filename, "r") do io
        header = string.(1:col_num)
        header[col_index] = "AM"
        csv = CSV.Source(io, types=fill(String, col_num), delim = ';', header = header)
        itr = getiterator(csv)
        count_field_iterate_colname(itr)
    end
 end

 """
 Same as `count_field` above, but using an iterator over lines, picking
 a column with a given index.
 """
 function count_field_iterate_index(itr, col_index)
    c = Dict{String, Int}()
    for i in itr
        field = i[col_index]
        if !isnull(field)
            kind = get(field)
            c[kind] = get(c, kind, 0) + 1
        end
    end
    c
 end

 function count_field_iterate_index(filename::String,
                                     col_index, col_num; delim = ';')
    open(filename, "r") do io
        header = string.(1:col_num)
        csv = CSV.Source(io, types=fill(String, col_num), delim = ';',
                         header = header)
        itr = getiterator(csv)
        count_field_iterate_index(itr, col_index)
    end
 end

 ######################################################################
 # runtime code
 ######################################################################

 file = tempname() * ".csv"      # make sure you have enough space
 col_index = 5
 col_num = 20

 gen_data(file, 10^6, col_num, col_index) # try 10^8 for the actual data I was using

 # compilation time included, but runtime should dominate

 # using basic Julia
 @time c1 = count_field(file, col_index)

 # iterate, use a named column
 @time c2 = count_field_iterate_colname(file, col_index, col_num)

 # iterate, use a named column
 @time c3 = count_field_iterate_index(file, col_index, col_num)

 @assert c1 == c2 == c3
	######################################################################
	# MWE for tabulating large CSV files, using plain Julia and IterateTables
	# See the discussion at
	# https://discourse.julialang.org/t/iterate-delimited-file-as-namedtuples/3616/3
	#
	# Some simplifications:
	# - using the index of iterated tuples to make it comparable,
	# but would prefer to use the column name in real code.
	# - not using compressed files (because CSV does not support Zlib streams ATM)
	######################################################################

	using CSV
	using IterableTables

	######################################################################
	# generating fake data
	######################################################################

	"""
	Generate `lines` of fake data, overwriting `filename`.

	Each line has the given `columns`, filled with random integers, except
	for the `col_index` column which is filled with random
	`values`. `delim` is the delimiter.
	"""
	function gen_data(filename, lines, columns, col_index;
	values = string.('A':'Z'), delim = ';')
	@assert col_index ∈ 1:columns
	io = open(filename, "w")
	for i in 1:lines
	for j in 1:(col_index-1)
	print(io, rand(1:100000), ";")
	end
	print(io, rand(values))
	for j in (col_index+1):columns
	print(io, ";", rand(1:100000))
	end
	println(io)
	end
	close(io)
	end


	######################################################################
	# the dumb version
	######################################################################

	"""
	Read lines from `io`, call `f` on its fields as a vector.

	Important: `split` is used with `delim`. No fancy CSV escapes.

	# Keyword arguments:

	- when `progress > 0`, print a dot for that many lines read.
	- when `limit > 0`, only split up to that many fields.
	- `delim` is the delimiter.
	- when `maxlines > 0`, limit the number of lines read (for testing etc).
	"""
	function dolines(f, io;
	progress = 1000000, limit = 0, delim = ';', maxlines = 0)
	line = 0
	while !eof(io)
	if isa(progress, Integer) && line % progress == 0
	print(".")
	end
	line += 1
	if maxlines > 0 && line ≥ maxlines
	break
	end
	f(split(readline(io), delim; limit = limit))
	end
	end

	"Count the frequencies of strings in `col_index`, return as a `Dict`."
	function count_field(filename, col_index; options...)
	c = Dict{String,Int}()
	open(filename, "r") do io
	dolines(io;
	limit = col_index + 1, delim = ';', options...) do fields
	kind = fields[col_index]
	c[kind] = get(c, kind, 0) + 1
	end
	end
	c
	end

	######################################################################
	# using IterateTables and CSV
	######################################################################

	"""
	Same as `count_field` above, but using an iterator over lines, picking
	a fixed column called `AM`.
	"""
	function count_field_iterate_colname(itr)
	c = Dict{String, Int}()
	for i in itr
	field = i.AM # colum fixed
	if !isnull(field)
	kind = get(field)
	c[kind] = get(c, kind, 0) + 1
	end
	end
	c
	end

	function count_field_iterate_colname(filename::String,
	col_index, col_num; delim = ';')
	open(filename, "r") do io
	header = string.(1:col_num)
	header[col_index] = "AM"
	csv = CSV.Source(io, types=fill(String, col_num), delim = ';', header = header)
	itr = getiterator(csv)
	count_field_iterate_colname(itr)
	end
	end

	"""
	Same as `count_field` above, but using an iterator over lines, picking
	a column with a given index.
	"""
	function count_field_iterate_index(itr, col_index)
	c = Dict{String, Int}()
	for i in itr
	field = i[col_index]
	if !isnull(field)
	kind = get(field)
	c[kind] = get(c, kind, 0) + 1
	end
	end
	c
	end

	function count_field_iterate_index(filename::String,
	col_index, col_num; delim = ';')
	open(filename, "r") do io
	header = string.(1:col_num)
	csv = CSV.Source(io, types=fill(String, col_num), delim = ';',
	header = header)
	itr = getiterator(csv)
	count_field_iterate_index(itr, col_index)
	end
	end

	######################################################################
	# runtime code
	######################################################################

	file = tempname() * ".csv" # make sure you have enough space
	col_index = 5
	col_num = 20

	gen_data(file, 10^6, col_num, col_index) # try 10^8 for the actual data I was using

	# compilation time included, but runtime should dominate

	# using basic Julia
	@time c1 = count_field(file, col_index)

	# iterate, use a named column
	@time c2 = count_field_iterate_colname(file, col_index, col_num)

	# iterate, use a named column
	@time c3 = count_field_iterate_index(file, col_index, col_num)

	@assert c1 == c2 == c3
No results found