Last active
May 13, 2017 06:15
-
-
Save tpapp/fe96e258628d4c1b01e38fa8283f8760 to your computer and use it in GitHub Desktop.
MWE for discussion at https://discourse.julialang.org/t/iterate-delimited-file-as-namedtuples/3616/3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ###################################################################### | |
| # MWE for tabulating large CSV files, using plain Julia and IterateTables | |
| # See the discussion at | |
| # https://discourse.julialang.org/t/iterate-delimited-file-as-namedtuples/3616/3 | |
| # | |
| # Some simplifications: | |
| # - using the index of iterated tuples to make it comparable, | |
| # but would prefer to use the column name in real code. | |
| # - not using compressed files (because CSV does not support Zlib streams ATM) | |
| ###################################################################### | |
| using CSV | |
| using IterableTables | |
| ###################################################################### | |
| # generating fake data | |
| ###################################################################### | |
| """ | |
| Generate `lines` of fake data, overwriting `filename`. | |
| Each line has the given `columns`, filled with random integers, except | |
| for the `col_index` column which is filled with random | |
| `values`. `delim` is the delimiter. | |
| """ | |
| function gen_data(filename, lines, columns, col_index; | |
| values = string.('A':'Z'), delim = ';') | |
| @assert col_index ∈ 1:columns | |
| io = open(filename, "w") | |
| for i in 1:lines | |
| for j in 1:(col_index-1) | |
| print(io, rand(1:100000), ";") | |
| end | |
| print(io, rand(values)) | |
| for j in (col_index+1):columns | |
| print(io, ";", rand(1:100000)) | |
| end | |
| println(io) | |
| end | |
| close(io) | |
| end | |
| ###################################################################### | |
| # the dumb version | |
| ###################################################################### | |
| """ | |
| Read lines from `io`, call `f` on its fields as a vector. | |
| **Important**: `split` is used with `delim`. No fancy CSV escapes. | |
| # Keyword arguments: | |
| - when `progress > 0`, print a dot for that many lines read. | |
| - when `limit > 0`, only split up to that many fields. | |
| - `delim` is the delimiter. | |
| - when `maxlines > 0`, limit the number of lines read (for testing etc). | |
| """ | |
| function dolines(f, io; | |
| progress = 1000000, limit = 0, delim = ';', maxlines = 0) | |
| line = 0 | |
| while !eof(io) | |
| if isa(progress, Integer) && line % progress == 0 | |
| print(".") | |
| end | |
| line += 1 | |
| if maxlines > 0 && line ≥ maxlines | |
| break | |
| end | |
| f(split(readline(io), delim; limit = limit)) | |
| end | |
| end | |
| "Count the frequencies of strings in `col_index`, return as a `Dict`." | |
| function count_field(filename, col_index; options...) | |
| c = Dict{String,Int}() | |
| open(filename, "r") do io | |
| dolines(io; | |
| limit = col_index + 1, delim = ';', options...) do fields | |
| kind = fields[col_index] | |
| c[kind] = get(c, kind, 0) + 1 | |
| end | |
| end | |
| c | |
| end | |
| ###################################################################### | |
| # using IterateTables and CSV | |
| ###################################################################### | |
| """ | |
| Same as `count_field` above, but using an iterator over lines, picking | |
| a fixed column called `AM`. | |
| """ | |
| function count_field_iterate_colname(itr) | |
| c = Dict{String, Int}() | |
| for i in itr | |
| field = i.AM # colum fixed | |
| if !isnull(field) | |
| kind = get(field) | |
| c[kind] = get(c, kind, 0) + 1 | |
| end | |
| end | |
| c | |
| end | |
| function count_field_iterate_colname(filename::String, | |
| col_index, col_num; delim = ';') | |
| open(filename, "r") do io | |
| header = string.(1:col_num) | |
| header[col_index] = "AM" | |
| csv = CSV.Source(io, types=fill(String, col_num), delim = ';', header = header) | |
| itr = getiterator(csv) | |
| count_field_iterate_colname(itr) | |
| end | |
| end | |
| """ | |
| Same as `count_field` above, but using an iterator over lines, picking | |
| a column with a given index. | |
| """ | |
| function count_field_iterate_index(itr, col_index) | |
| c = Dict{String, Int}() | |
| for i in itr | |
| field = i[col_index] | |
| if !isnull(field) | |
| kind = get(field) | |
| c[kind] = get(c, kind, 0) + 1 | |
| end | |
| end | |
| c | |
| end | |
| function count_field_iterate_index(filename::String, | |
| col_index, col_num; delim = ';') | |
| open(filename, "r") do io | |
| header = string.(1:col_num) | |
| csv = CSV.Source(io, types=fill(String, col_num), delim = ';', | |
| header = header) | |
| itr = getiterator(csv) | |
| count_field_iterate_index(itr, col_index) | |
| end | |
| end | |
| ###################################################################### | |
| # runtime code | |
| ###################################################################### | |
| file = tempname() * ".csv" # make sure you have enough space | |
| col_index = 5 | |
| col_num = 20 | |
| gen_data(file, 10^6, col_num, col_index) # try 10^8 for the actual data I was using | |
| # compilation time included, but runtime should dominate | |
| # using basic Julia | |
| @time c1 = count_field(file, col_index) | |
| # iterate, use a named column | |
| @time c2 = count_field_iterate_colname(file, col_index, col_num) | |
| # iterate, use a named column | |
| @time c3 = count_field_iterate_index(file, col_index, col_num) | |
| @assert c1 == c2 == c3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment