NimbleCSV Parsing Benchmark

Mix.install([
  {:nimble_csv, "~> 1.3"},
  {:explorer, "~> 0.11"},
  {:benchee, "~> 1.5"},
  {:kino_benchee, "~> 0.1"}
])

1. Generate CSV

defmodule BenchCSV do
  @moduledoc """
  Generates the synthetic CSV fixture used by the parsing benchmark.

  The file consists of one header line (`col1,col2,...`) followed by `rows`
  identical data lines of `cols` three-byte fields. Every line, including the
  last one, is newline-terminated.
  """

  # Defaults: 35_000 rows x 2_000 cols. Each data line is 8_000 bytes
  # (2_000 fields * 3 bytes + 1_999 commas + newline), i.e. about 267 MiB total.
  @default_rows 35_000
  @default_cols 2_000
  @default_chunk_rows 1_000
  @default_filename "nimble_bench.csv"

  @doc """
  Writes the fixture file and returns its path.

  ## Options

    * `:rows` - number of data rows, integer >= 0 (default `35_000`)
    * `:cols` - number of columns, integer >= 1 (default `2_000`)
    * `:chunk_rows` - rows written per `IO.binwrite/2` call, integer >= 1
      (default `1_000`)
    * `:dir` - target directory (default `System.tmp_dir!/0`)
    * `:filename` - file name inside `:dir` (default `"nimble_bench.csv"`)

  Prints the resulting file size to stdout. Raises `ArgumentError` when a
  numeric option is out of range, and raises if the file cannot be written.
  """
  @spec generate!(keyword()) :: Path.t()
  def generate!(opts \\ []) do
    rows = Keyword.get(opts, :rows, @default_rows)
    cols = Keyword.get(opts, :cols, @default_cols)
    chunk_rows = Keyword.get(opts, :chunk_rows, @default_chunk_rows)
    dir = Keyword.get(opts, :dir, System.tmp_dir!())
    filename = Keyword.get(opts, :filename, @default_filename)

    # Fail loudly up front: `1..0` is a *descending* range, so `cols: 0` would
    # otherwise emit a bogus "col1,col0" header, and `chunk_rows: 0` would
    # surface as an ArithmeticError from div/2.
    validate_min!(:rows, rows, 0)
    validate_min!(:cols, cols, 1)
    validate_min!(:chunk_rows, chunk_rows, 1)

    path = Path.join(dir, filename)
    write!(path, rows, cols, chunk_rows)

    mb = Float.round(File.stat!(path).size / 1_048_576, 1)
    IO.puts("#{mb} MB written to #{path}")

    path
  end

  defp validate_min!(_name, value, min) when is_integer(value) and value >= min, do: :ok

  defp validate_min!(name, value, min) do
    raise ArgumentError,
          "expected :#{name} to be an integer >= #{min}, got: #{inspect(value)}"
  end

  # Writes the header, then the data rows in pre-built chunks of `chunk_rows`
  # lines so the file is produced with few large writes.
  defp write!(path, rows, cols, chunk_rows) do
    full_chunks = div(rows, chunk_rows)
    remainder = rem(rows, chunk_rows)
    data_line = build_data_row(cols) <> "\n"

    File.open!(path, [:write, :raw, :binary], fn file ->
      # Match on :ok so a failed write raises here instead of being ignored.
      :ok = IO.binwrite(file, [build_header(cols), ?\n])

      if full_chunks > 0 do
        chunk = String.duplicate(data_line, chunk_rows)
        Enum.each(1..full_chunks, fn _ -> :ok = IO.binwrite(file, chunk) end)
      end

      if remainder > 0 do
        :ok = IO.binwrite(file, String.duplicate(data_line, remainder))
      end

      :ok
    end)
  end

  defp build_header(cols), do: Enum.map_join(1..cols, ",", &"col#{&1}")

  # Each field is three bytes: a lowercase letter followed by two digits, all
  # derived from the column index. The value depends only on rem(i, 26) and
  # rem(i, 100), so fields vary across a row but repeat every 1_300 columns.
  defp build_data_row(cols) do
    Enum.map_join(1..cols, ",", fn i ->
      <<97 + rem(i, 26), 48 + rem(i, 10), 48 + rem(div(i, 10), 10)>>
    end)
  end
end

# Write the fixture with the default dimensions; `generate!/1` returns the file path.
BenchCSV.generate!()

267.0 MB written to /tmp/nimble_bench.csv

"/tmp/nimble_bench.csv"

2. Parsing strategies

defmodule Bench do
  @moduledoc """
  The CSV parsing strategies compared by the benchmark.

  Every function reads the same file (`nimble_bench.csv` in the system temp
  directory), parses it completely and throws the parsed data away. The
  NimbleCSV variants only call `length/1` on each row; the Explorer variants
  return the row count.
  """

  alias Explorer.DataFrame
  alias NimbleCSV.RFC4180, as: CSV

  # Both attributes are evaluated once, when the module is compiled.
  @csv_path Path.join(System.tmp_dir!(), "nimble_bench.csv")
  @schedulers System.schedulers_online()

  @doc """
  Reads the whole file into one binary and parses it eagerly in a single
  process, so the complete list of rows is built before it is traversed.
  """
  def parse_string do
    @csv_path
    |> File.read!()
    |> CSV.parse_string(skip_headers: true)
    |> Enum.each(&length/1)
  end

  @doc """
  Parses the file lazily, line by line, in a single process.

  `read_ahead: 1_000_000` enables buffered reads for the underlying file so
  that a line does not have to be fetched with its own read.
  """
  def parse_stream do
    @csv_path
    |> File.stream!(read_ahead: 1_000_000)
    |> CSV.parse_stream(skip_headers: true)
    |> Stream.each(&length/1)
    |> Stream.run()
  end

  @doc """
  Reads the whole file, splits it into line-aligned chunks and parses the
  chunks concurrently, one task per online scheduler at most.

  `ordered: false` lets results be emitted as tasks finish; they are discarded
  anyway. `timeout: :infinity` keeps slow chunks from being killed.
  """
  def parallel_parse_string do
    @csv_path
    |> File.read!()
    |> chunk_binary(@schedulers)
    |> Task.async_stream(
      &parse_chunk/1,
      max_concurrency: @schedulers,
      ordered: false,
      timeout: :infinity
    )
    |> Stream.run()
  end

  # Splits `binary` into at most `n` line-aligned chunks with the header line
  # removed. Chunks are joined with "\n" and carry no trailing newline.
  #
  # Splitting is done on every "\n", so this is only valid for files without
  # newlines inside quoted fields (true for the generated fixture).
  defp chunk_binary(binary, n) do
    [_header | lines] =
      binary
      |> drop_final_newline()
      |> :binary.split("\n", [:global])

    case length(lines) do
      0 ->
        []

      count ->
        # Ceiling division: floor division would leave a short extra chunk
        # whenever `count` is not a multiple of `n`, yielding more than `n`
        # chunks for `n` workers.
        lines
        |> Enum.chunk_every(div(count + n - 1, n))
        |> Enum.map(&Enum.join(&1, "\n"))
    end
  end

  # Removes exactly one terminating newline. Without this the split yields an
  # empty trailing element that is counted as a line and ends up in a chunk of
  # its own. Blank lines before the final newline are kept.
  defp drop_final_newline(binary) do
    size = byte_size(binary)

    if size > 0 and :binary.last(binary) == ?\n do
      binary_part(binary, 0, size - 1)
    else
      binary
    end
  end

  # The header was already removed by chunk_binary/2, hence `skip_headers: false`.
  defp parse_chunk(chunk) do
    chunk
    |> CSV.parse_string(skip_headers: false)
    |> Enum.each(&length/1)
  end

  @doc """
  Loads the file eagerly into an Explorer data frame and returns its row count.
  """
  # NOTE(review): `infer_schema_length: 0` presumably disables type inference so
  # every column is read as a string - confirm against the Explorer docs.
  def explorer_eager do
    @csv_path
    |> DataFrame.from_csv!(infer_schema_length: 0)
    |> DataFrame.n_rows()
  end

  @doc """
  Opens the file as a lazy Explorer data frame, collects it and returns the
  row count.
  """
  def explorer_lazy do
    @csv_path
    |> DataFrame.from_csv!(lazy: true, infer_schema_length: 0)
    |> DataFrame.collect()
    |> DataFrame.n_rows()
  end
end

{:module, Bench, <<70, 79, 82, 49, 0, 0, 17, ...>>, ...}

3. Benchmark

# Each job is run untimed for 2 s (`warmup`) before the 10 s of timed runs, so
# every strategy should find the file already in the OS page cache.
# `memory_time: 2` adds a 2 s memory measurement per job. Benchee reports the
# memory allocated in the process that runs the job (not RSS); work done in
# other processes, such as the tasks of the parallel variant, is not counted.
scenarios = %{
  "NimbleCSV parse_string" => &Bench.parse_string/0,
  "NimbleCSV parse_stream" => &Bench.parse_stream/0,
  "NimbleCSV parallel parse_string" => &Bench.parallel_parse_string/0,
  "Explorer from_csv eager" => &Bench.explorer_eager/0,
  "Explorer from_csv lazy" => &Bench.explorer_lazy/0
}

options = [warmup: 2, time: 10, memory_time: 2]

Benchee.run(scenarios, options)

Operating System: Linux
CPU Information: Unrecognized processor
Number of Available Cores: 8
Available memory: 58.34 GB
Elixir 1.19.3
Erlang 28.1.1
JIT enabled: true

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 10 s
memory time: 2 s
reduction time: 0 ns
parallel: 1
inputs: none specified
Estimated total run time: 1 min 10 s
Excluding outliers: false

Benchmarking Explorer from_csv eager ...
Benchmarking Explorer from_csv lazy ...
Benchmarking NimbleCSV parallel parse_string ...
Benchmarking NimbleCSV parse_stream ...
Benchmarking NimbleCSV parse_string ...
Calculating statistics...
Formatting results...

Name                                      ips        average  deviation         median         99th %
Explorer from_csv eager                  3.92         0.26 s     ±2.00%         0.26 s         0.27 s
Explorer from_csv lazy                   3.87         0.26 s     ±2.64%         0.26 s         0.27 s
NimbleCSV parse_stream                   0.85         1.17 s     ±1.04%         1.17 s         1.20 s
NimbleCSV parallel parse_string          0.67         1.49 s     ±1.14%         1.49 s         1.52 s
NimbleCSV parse_string                  0.105         9.55 s     ±0.54%         9.55 s         9.58 s

Comparison: 
Explorer from_csv eager                  3.92
Explorer from_csv lazy                   3.87 - 1.01x slower +0.00267 s
NimbleCSV parse_stream                   0.85 - 4.59x slower +0.92 s
NimbleCSV parallel parse_string          0.67 - 5.85x slower +1.24 s
NimbleCSV parse_string                  0.105 - 37.38x slower +9.29 s

Memory usage statistics:

Name                                    average  deviation         median         99th %
Explorer from_csv eager                0.162 MB     ±0.00%       0.162 MB       0.162 MB
Explorer from_csv lazy                 0.162 MB     ±0.00%       0.162 MB       0.162 MB
NimbleCSV parse_stream                 26.03 MB     ±0.00%       26.03 MB       26.03 MB
NimbleCSV parallel parse_string         6.36 MB     ±0.00%        6.36 MB        6.36 MB
NimbleCSV parse_string               2673.38 MB     ±0.00%     2673.38 MB     2673.38 MB

Comparison: 
Explorer from_csv eager                0.162 MB
Explorer from_csv lazy                 0.162 MB - 1.00x memory usage +0.00037 MB
NimbleCSV parse_stream                 26.03 MB - 160.63x memory usage +25.87 MB
NimbleCSV parallel parse_string         6.36 MB - 39.28x memory usage +6.20 MB
NimbleCSV parse_string               2673.38 MB - 16499.00x memory usage +2673.22 MB

Nezteb/csv_parsing_benchmark.livemd

Select an option

No results found

Select an option

No results found

NimbleCSV Parsing Benchmark

1. Generate CSV

2. Parsing strategies

3. Benchmark

Nezteb commented Jun 6, 2026

Uh oh!