Skip to content

Instantly share code, notes, and snippets.

@Nezteb
Created June 6, 2026 06:07
Show Gist options
  • Select an option

  • Save Nezteb/02de1aec7b83d7dd20b669205ab0ecbe to your computer and use it in GitHub Desktop.

Select an option

Save Nezteb/02de1aec7b83d7dd20b669205ab0ecbe to your computer and use it in GitHub Desktop.
Elixir CSV parsing benchmark as a Livebook.

NimbleCSV Parsing Benchmark

Mix.install([
  {:nimble_csv, "~> 1.3"},
  {:explorer, "~> 0.11"},
  {:benchee, "~> 1.5"},
  {:kino_benchee, "~> 0.1"}
])

1. Generate CSV

defmodule BenchCSV do
  @moduledoc """
  Generates the synthetic CSV file used as benchmark input.

  The file consists of one header line (`col1,col2,...`) followed by `rows`
  identical data lines, each holding `cols` three-byte fields. Every line,
  including the last one, is terminated by `"\\n"`.
  """

  # Original input specification: ~250 MB, 35_000 rows, 2_000 cols
  @default_rows 35_000
  @default_cols 2_000
  @default_chunk_rows 1_000
  @default_filename "nimble_bench.csv"

  @doc """
  Writes the CSV file, prints its size in MB and returns its path.

  ## Options

    * `:rows` - number of data rows, integer >= 0 (default `35_000`)
    * `:cols` - number of columns, integer >= 1 (default `2_000`)
    * `:chunk_rows` - how many rows are written per `IO.binwrite/2` call,
      integer >= 1 (default `1_000`)
    * `:dir` - target directory (default `System.tmp_dir!/0`)
    * `:filename` - file name inside `:dir` (default `"nimble_bench.csv"`)

  Raises `ArgumentError` when a numeric option is out of range. Raises if the
  file cannot be opened or a write does not return `:ok`.
  """
  @spec generate!(keyword()) :: Path.t()
  def generate!(opts \\ []) do
    rows = fetch_integer!(opts, :rows, @default_rows, 0)
    cols = fetch_integer!(opts, :cols, @default_cols, 1)
    chunk_rows = fetch_integer!(opts, :chunk_rows, @default_chunk_rows, 1)
    dir = Keyword.get(opts, :dir, System.tmp_dir!())
    filename = Keyword.get(opts, :filename, @default_filename)

    path = Path.join(dir, filename)
    write!(path, rows, cols, chunk_rows)

    mb = Float.round(File.stat!(path).size / 1_048_576, 1)
    IO.puts("#{mb} MB written to #{path}")

    path
  end

  # Reads an integer option and rejects values below `min`, so that a bad
  # option fails with a clear message instead of a division by zero
  # (`chunk_rows: 0`) or a descending `1..0` range (`cols: 0`).
  defp fetch_integer!(opts, key, default, min) do
    case Keyword.get(opts, key, default) do
      value when is_integer(value) and value >= min ->
        value

      other ->
        raise ArgumentError,
              "expected #{inspect(key)} to be an integer >= #{min}, got: #{inspect(other)}"
    end
  end

  # Writes the header and then `rows` copies of the data line. Rows are written
  # in batches of `chunk_rows` so the whole file is never built in memory.
  defp write!(path, rows, cols, chunk_rows) do
    header = [build_header(cols), "\n"]
    line = build_data_row(cols) <> "\n"
    full_chunks = div(rows, chunk_rows)
    remainder = rem(rows, chunk_rows)

    File.open!(path, [:write, :raw, :binary], fn file ->
      # Every write is matched against :ok so a failed write raises
      # instead of silently producing a truncated file.
      :ok = IO.binwrite(file, header)

      if full_chunks > 0 do
        # The full-size chunk is built once and reused for every batch.
        chunk = build_chunk(line, chunk_rows)
        Enum.each(1..full_chunks, fn _ -> :ok = IO.binwrite(file, chunk) end)
      end

      if remainder > 0 do
        :ok = IO.binwrite(file, build_chunk(line, remainder))
      end

      :ok
    end)
  end

  # "col1,col2,...,colN" without a trailing newline.
  defp build_header(cols), do: Enum.map_join(1..cols, ",", &"col#{&1}")

  # Field `i` is three bytes derived from `i`: a lowercase letter chosen by
  # `rem(i, 26)`, then the units digit and the tens digit of `i`. The values
  # differ between neighbouring columns but repeat every 1_300 columns, so they
  # are varied rather than globally unique. The original author's stated intent
  # is to keep the BEAM from over-optimising binary sharing.
  defp build_data_row(cols) do
    Enum.map_join(1..cols, ",", fn i ->
      <<97 + rem(i, 26), 48 + rem(i, 10), 48 + rem(div(i, 10), 10)>>
    end)
  end

  # `n` copies of an already newline-terminated line.
  defp build_chunk(line, n), do: String.duplicate(line, n)
end

# Generate the benchmark input with the default options; the call prints the
# file size and evaluates to the path of the written file.
BenchCSV.generate!()
267.0 MB written to /tmp/nimble_bench.csv
"/tmp/nimble_bench.csv"

2. Parsing strategies

defmodule Bench do
  @moduledoc """
  CSV parsing strategies compared by the benchmark.

  Every strategy reads the file at `@csv_path`, walks all parsed rows and
  discards them; the functions exist to be timed, not to return data.
  """

  # Both attributes are evaluated once, when the module is compiled.
  @csv_path Path.join(System.tmp_dir!(), "nimble_bench.csv")
  @schedulers System.schedulers_online()

  @doc """
  Reads the whole file into one binary and parses it eagerly with
  `NimbleCSV.RFC4180.parse_string/2`, skipping the header row.
  """
  @spec parse_string() :: :ok
  def parse_string do
    @csv_path
    |> File.read!()
    |> NimbleCSV.RFC4180.parse_string(skip_headers: true)
    |> Enum.each(&length/1)
  end

  @doc """
  Parses the file lazily, line by line, in the calling process.

  `read_ahead` buffers reads so that there is not one syscall per line
  (helpful for wide files).
  """
  @spec parse_stream() :: :ok
  def parse_stream do
    @csv_path
    |> File.stream!(read_ahead: 1_000_000)
    |> NimbleCSV.RFC4180.parse_stream(skip_headers: true)
    |> Stream.each(&length/1)
    |> Stream.run()
  end

  @doc """
  Reads the whole file, cuts it into at most one chunk per online scheduler
  and parses the chunks concurrently.

  The per-chunk results are discarded, so `ordered: false` is used: the
  completion order of the tasks does not matter.
  """
  @spec parallel_parse_string() :: :ok
  def parallel_parse_string do
    @csv_path
    |> File.read!()
    |> chunk_binary(@schedulers)
    |> Task.async_stream(
      &parse_chunk/1,
      max_concurrency: @schedulers,
      ordered: false,
      timeout: :infinity
    )
    |> Stream.run()
  end

  @doc """
  Splits `binary` into at most `n` line-aligned chunks with the header removed.

  The first line is dropped as the header. The remaining lines are grouped
  into chunks of `ceil(line_count / n)` lines and re-joined with `"\\n"`
  (without a trailing newline). Trailing empty segments, such as the one after
  a final newline, are ignored, so no empty chunk is produced and the chunk
  count never exceeds `n`. Returns `[]` when there are no data lines.

  The split happens on every `"\\n"` byte, so the input must not contain
  quoted fields with embedded newlines.
  """
  @spec chunk_binary(binary(), pos_integer()) :: [binary()]
  def chunk_binary(binary, n) when is_binary(binary) and is_integer(n) and n > 0 do
    lines =
      binary
      # :trim removes the empty segment that follows the final newline.
      |> :binary.split("\n", [:global, :trim])
      |> Enum.drop(1)

    # Ceiling division keeps the number of chunks <= n; floor division would
    # leave a small extra chunk whenever the line count is not a multiple of n.
    chunk_size = max(1, div(length(lines) + n - 1, n))

    lines
    |> Enum.chunk_every(chunk_size)
    |> Enum.map(&Enum.join(&1, "\n"))
  end

  # Chunks no longer contain the header, hence `skip_headers: false`.
  defp parse_chunk(chunk) do
    chunk
    |> NimbleCSV.RFC4180.parse_string(skip_headers: false)
    |> Enum.each(&length/1)
  end

  @doc """
  Loads the file eagerly into an `Explorer.DataFrame` and returns its row count.

  NOTE(review): `infer_schema_length: 0` presumably disables dtype inference so
  every column is read as a string - confirm against the Explorer docs.
  """
  @spec explorer_eager() :: non_neg_integer()
  def explorer_eager do
    @csv_path
    |> Explorer.DataFrame.from_csv!(infer_schema_length: 0)
    |> Explorer.DataFrame.n_rows()
  end

  @doc """
  Same as `explorer_eager/0`, but opens the file with `lazy: true` and
  materialises it with `Explorer.DataFrame.collect/1` before counting rows.
  """
  @spec explorer_lazy() :: non_neg_integer()
  def explorer_lazy do
    @csv_path
    |> Explorer.DataFrame.from_csv!(lazy: true, infer_schema_length: 0)
    |> Explorer.DataFrame.collect()
    |> Explorer.DataFrame.n_rows()
  end
end
{:module, Bench, <<70, 79, 82, 49, 0, 0, 17, ...>>, ...}

3. Benchmark

# `warmup: 2` runs every job for 2 s before measuring, which also primes the
# OS page cache so all strategies see equal I/O.
# `time: 10` is the measured run time per job; `memory_time: 2` adds a 2 s
# memory-measurement pass per job.
# NOTE(review): Benchee documents its memory figure as memory allocated by the
# process that runs the job, not RSS; work done in other processes (e.g. the
# Task workers of the parallel job) is presumably not counted - confirm.
%{
  "NimbleCSV parse_string" => &Bench.parse_string/0,
  "NimbleCSV parse_stream" => &Bench.parse_stream/0,
  "NimbleCSV parallel parse_string" => &Bench.parallel_parse_string/0,
  "Explorer from_csv eager" => &Bench.explorer_eager/0,
  "Explorer from_csv lazy" => &Bench.explorer_lazy/0
}
|> Benchee.run(warmup: 2, time: 10, memory_time: 2)
Operating System: Linux
CPU Information: Unrecognized processor
Number of Available Cores: 8
Available memory: 58.34 GB
Elixir 1.19.3
Erlang 28.1.1
JIT enabled: true

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 10 s
memory time: 2 s
reduction time: 0 ns
parallel: 1
inputs: none specified
Estimated total run time: 1 min 10 s
Excluding outliers: false

Benchmarking Explorer from_csv eager ...
Benchmarking Explorer from_csv lazy ...
Benchmarking NimbleCSV parallel parse_string ...
Benchmarking NimbleCSV parse_stream ...
Benchmarking NimbleCSV parse_string ...
Calculating statistics...
Formatting results...

Name                                      ips        average  deviation         median         99th %
Explorer from_csv eager                  3.92         0.26 s     ±2.00%         0.26 s         0.27 s
Explorer from_csv lazy                   3.87         0.26 s     ±2.64%         0.26 s         0.27 s
NimbleCSV parse_stream                   0.85         1.17 s     ±1.04%         1.17 s         1.20 s
NimbleCSV parallel parse_string          0.67         1.49 s     ±1.14%         1.49 s         1.52 s
NimbleCSV parse_string                  0.105         9.55 s     ±0.54%         9.55 s         9.58 s

Comparison: 
Explorer from_csv eager                  3.92
Explorer from_csv lazy                   3.87 - 1.01x slower +0.00267 s
NimbleCSV parse_stream                   0.85 - 4.59x slower +0.92 s
NimbleCSV parallel parse_string          0.67 - 5.85x slower +1.24 s
NimbleCSV parse_string                  0.105 - 37.38x slower +9.29 s

Memory usage statistics:

Name                                    average  deviation         median         99th %
Explorer from_csv eager                0.162 MB     ±0.00%       0.162 MB       0.162 MB
Explorer from_csv lazy                 0.162 MB     ±0.00%       0.162 MB       0.162 MB
NimbleCSV parse_stream                 26.03 MB     ±0.00%       26.03 MB       26.03 MB
NimbleCSV parallel parse_string         6.36 MB     ±0.00%        6.36 MB        6.36 MB
NimbleCSV parse_string               2673.38 MB     ±0.00%     2673.38 MB     2673.38 MB

Comparison: 
Explorer from_csv eager                0.162 MB
Explorer from_csv lazy                 0.162 MB - 1.00x memory usage +0.00037 MB
NimbleCSV parse_stream                 26.03 MB - 160.63x memory usage +25.87 MB
NimbleCSV parallel parse_string         6.36 MB - 39.28x memory usage +6.20 MB
NimbleCSV parse_string               2673.38 MB - 16499.00x memory usage +2673.22 MB
@Nezteb

Nezteb commented Jun 6, 2026

Copy link
Copy Markdown
Author
visualization

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment