Skip to content

Instantly share code, notes, and snippets.

@tpapp
Last active May 13, 2017 06:15
Show Gist options
  • Select an option

  • Save tpapp/fe96e258628d4c1b01e38fa8283f8760 to your computer and use it in GitHub Desktop.

Select an option

Save tpapp/fe96e258628d4c1b01e38fa8283f8760 to your computer and use it in GitHub Desktop.
######################################################################
# MWE for tabulating large CSV files, using plain Julia and IterateTables
# See the discussion at
# https://discourse.julialang.org/t/iterate-delimited-file-as-namedtuples/3616/3
#
# Some simplifications:
# - using the index of iterated tuples to make it comparable,
# but would prefer to use the column name in real code.
# - not using compressed files (because CSV does not support Zlib streams ATM)
######################################################################
using CSV
using IterableTables
######################################################################
# generating fake data
######################################################################
"""
Generate `lines` of fake data, overwriting `filename`.
Each line has the given `columns`, filled with random integers, except
for the `col_index` column which is filled with random
`values`. `delim` is the delimiter.
"""
function gen_data(filename, lines, columns, col_index;
values = string.('A':'Z'), delim = ';')
@assert col_index ∈ 1:columns
io = open(filename, "w")
for i in 1:lines
for j in 1:(col_index-1)
print(io, rand(1:100000), ";")
end
print(io, rand(values))
for j in (col_index+1):columns
print(io, ";", rand(1:100000))
end
println(io)
end
close(io)
end
######################################################################
# the dumb version
######################################################################
"""
Read lines from `io`, call `f` on its fields as a vector.
**Important**: `split` is used with `delim`. No fancy CSV escapes.
# Keyword arguments:
- when `progress > 0`, print a dot for that many lines read.
- when `limit > 0`, only split up to that many fields.
- `delim` is the delimiter.
- when `maxlines > 0`, limit the number of lines read (for testing etc).
"""
function dolines(f, io;
progress = 1000000, limit = 0, delim = ';', maxlines = 0)
line = 0
while !eof(io)
if isa(progress, Integer) && line % progress == 0
print(".")
end
line += 1
if maxlines > 0 && line ≥ maxlines
break
end
f(split(readline(io), delim; limit = limit))
end
end
"Count the frequencies of strings in `col_index`, return as a `Dict`."
function count_field(filename, col_index; options...)
c = Dict{String,Int}()
open(filename, "r") do io
dolines(io;
limit = col_index + 1, delim = ';', options...) do fields
kind = fields[col_index]
c[kind] = get(c, kind, 0) + 1
end
end
c
end
######################################################################
# using IterateTables and CSV
######################################################################
"""
Same as `count_field` above, but using an iterator over lines, picking
a fixed column called `AM`.
"""
function count_field_iterate_colname(itr)
c = Dict{String, Int}()
for i in itr
field = i.AM # colum fixed
if !isnull(field)
kind = get(field)
c[kind] = get(c, kind, 0) + 1
end
end
c
end
function count_field_iterate_colname(filename::String,
col_index, col_num; delim = ';')
open(filename, "r") do io
header = string.(1:col_num)
header[col_index] = "AM"
csv = CSV.Source(io, types=fill(String, col_num), delim = ';', header = header)
itr = getiterator(csv)
count_field_iterate_colname(itr)
end
end
"""
Same as `count_field` above, but using an iterator over lines, picking
a column with a given index.
"""
function count_field_iterate_index(itr, col_index)
c = Dict{String, Int}()
for i in itr
field = i[col_index]
if !isnull(field)
kind = get(field)
c[kind] = get(c, kind, 0) + 1
end
end
c
end
function count_field_iterate_index(filename::String,
col_index, col_num; delim = ';')
open(filename, "r") do io
header = string.(1:col_num)
csv = CSV.Source(io, types=fill(String, col_num), delim = ';',
header = header)
itr = getiterator(csv)
count_field_iterate_index(itr, col_index)
end
end
######################################################################
# runtime code
######################################################################
file = tempname() * ".csv" # make sure you have enough space
col_index = 5
col_num = 20
gen_data(file, 10^6, col_num, col_index) # try 10^8 for the actual data I was using
# compilation time included, but runtime should dominate
# using basic Julia
@time c1 = count_field(file, col_index)
# iterate, use a named column
@time c2 = count_field_iterate_colname(file, col_index, col_num)
# iterate, use a named column
@time c3 = count_field_iterate_index(file, col_index, col_num)
@assert c1 == c2 == c3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment