Skip to content

Instantly share code, notes, and snippets.

@hoblin
Created February 28, 2025 01:43
Show Gist options
  • Save hoblin/9b252c2bf3fe041ec73b5f34e7575f65 to your computer and use it in GitHub Desktop.
Save hoblin/9b252c2bf3fe041ec73b5f34e7575f65 to your computer and use it in GitHub Desktop.
Benchmark comparing 7 different methods for efficiently estimating CSV row counts without reading entire files, evaluating memory usage, speed, and accuracy to identify the optimal implementation.
#!/usr/bin/env ruby
require "benchmark"
require "benchmark/memory"
require "csv"
require "date"
# Configuration
SAMPLE_LINES = 1000
BLOCK_SIZE = 4 * 1024 * 1024
BUFFER_SIZE = 4096
MULTI_POINT_SAMPLES = 3
GC_FREQUENCY = 100
DEFAULT_TEST_ROWS = 500_000
TEST_FILE_COLUMNS = 30
TEST_FILE_NAME = "test_data.csv"
def generate_test_csv
headers = [
"Date", "Campaign", "Ad Group", "Keyword", "Match Type", "Impressions",
"Clicks", "CTR", "CPC", "Cost", "Conversions", "CPA", "ROAS"
]
headers << "Metric_#{headers.size + 1}" while headers.size < TEST_FILE_COLUMNS
headers = headers[0...TEST_FILE_COLUMNS]
CSV.open(TEST_FILE_NAME, 'w') do |csv|
csv << headers
DEFAULT_TEST_ROWS.times do |i|
row = [
(Date.today - rand(365)).to_s,
"Campaign_#{(i % 50) + 1}",
"AdGroup_#{(i % 200) + 1}",
"keyword_#{(i % 1000) + 1}",
["Exact", "Phrase", "Broad"].sample,
rand(10000),
rand(500),
"#{(rand * 10).round(2)}%",
"$#{(rand * 5).round(2)}",
"$#{(rand * 1000).round(2)}",
rand(20),
"$#{(rand * 100).round(2)}",
"#{(rand * 10).round(2)}x"
]
row << "value_#{rand(1000)}" while row.size < TEST_FILE_COLUMNS
csv << row
end
end
end
def count_actual_lines(file_path)
count = 0
File.foreach(file_path) { |_| count += 1 }
count
end
def estimate_beginning_sample(file_path)
file_size = File.size(file_path)
lines_read = 0
bytes_read = 0
File.open(file_path, 'r') do |file|
SAMPLE_LINES.times do
line = file.gets
break unless line
bytes_read += line.bytesize
lines_read += 1
end
end
return 0 if lines_read.zero?
avg_line_size = bytes_read.to_f / lines_read
(file_size / avg_line_size).to_i
end
def estimate_multi_point(file_path)
file_size = File.size(file_path)
samples = []
sample_size = SAMPLE_LINES / MULTI_POINT_SAMPLES
File.open(file_path, 'r') do |file|
MULTI_POINT_SAMPLES.times do |i|
position = i * (file_size / (MULTI_POINT_SAMPLES + 1))
position = 0 if i.zero?
file.seek(position)
file.gets if position > 0
lines_read = 0
bytes_read = 0
sample_size.times do
line = file.gets
break unless line
bytes_read += line.bytesize
lines_read += 1
end
samples << (lines_read > 0 ? bytes_read.to_f / lines_read : 0)
end
end
valid_samples = samples.reject(&:zero?)
return 0 if valid_samples.empty?
avg_line_size = valid_samples.sum / valid_samples.size
(file_size / avg_line_size).to_i
end
def estimate_block_based(file_path)
file_size = File.size(file_path)
block = File.read(file_path, BLOCK_SIZE)
newline_count = block.count("\n")
return 0 if newline_count.zero?
bytes_per_line = block.bytesize.to_f / newline_count
(file_size / bytes_per_line).to_i
end
def estimate_byte_counting(file_path)
file_size = File.size(file_path)
newline_count = 0
bytes_read = 0
File.open(file_path, 'r') do |file|
while buffer = file.read(BUFFER_SIZE)
newline_count += buffer.count("\n")
bytes_read += buffer.bytesize
break if bytes_read >= BLOCK_SIZE
end
end
return 0 if newline_count.zero?
(file_size.to_f / bytes_read * newline_count).to_i
end
def estimate_with_foreach_gc(file_path)
file_size = File.size(file_path)
bytes_read = 0
line_count = 0
File.open(file_path, 'r') do |file|
SAMPLE_LINES.times do |i|
line = file.gets
break unless line
bytes_read += line.bytesize
line_count += 1
GC.start if i % GC_FREQUENCY == 0
end
end
return 0 if line_count.zero?
avg_line_size = bytes_read.to_f / line_count
(file_size / avg_line_size).to_i
end
def estimate_with_each_char(file_path)
file_size = File.size(file_path)
newline_count = 0
bytes_read = 0
File.open(file_path, 'r') do |file|
char_count = 0
file.each_char do |char|
newline_count += 1 if char == "\n"
bytes_read += 1
char_count += 1
break if char_count >= BLOCK_SIZE
end
end
return 0 if newline_count.zero?
(file_size.to_f / bytes_read * newline_count).to_i
end
def estimate_with_lazy_enum(file_path)
file_size = File.size(file_path)
lines_enum = File.open(file_path).each_line.lazy
samples = lines_enum.take(SAMPLE_LINES).to_a
bytes_read = samples.sum(&:bytesize)
line_count = samples.size
return 0 if line_count.zero?
avg_line_size = bytes_read.to_f / line_count
(file_size / avg_line_size).to_i
end
def benchmark_memory_usage(file_path, methods)
puts "\nMemory Usage Benchmark:"
Benchmark.memory do |x|
methods.each do |name, method|
x.report(name) { method.call(file_path) }
end
x.compare!
end
end
def benchmark_speed(file_path, methods)
puts "\nSpeed Benchmark:"
Benchmark.bm(25) do |x|
methods.each do |name, method|
x.report(name) { method.call(file_path) }
end
end
end
def check_accuracy(file_path, methods)
file_size = File.size(file_path)
# Skip accuracy check for very large files
if file_size >= 100 * 1024 * 1024
puts "\nSkipping accuracy check for large file (#{file_size / 1024 / 1024} MB)"
return
end
puts "\nAccuracy Check:"
actual = count_actual_lines(file_path)
puts "Actual line count: #{actual}"
methods.each do |name, method|
estimate = method.call(file_path)
error = actual.zero? ? 0 : ((estimate - actual).abs.to_f / actual * 100).round(2)
puts "#{name}: #{estimate} (Error: #{error}%)"
end
end
generate_test_csv
puts "Generated test CSV file with #{DEFAULT_TEST_ROWS} rows and #{TEST_FILE_COLUMNS} columns: #{TEST_FILE_NAME}"
puts "File size: #{File.size(TEST_FILE_NAME)} bytes"
estimation_methods = {
"Beginning Sample" => method(:estimate_beginning_sample),
"Multi-Point" => method(:estimate_multi_point),
"Block-Based" => method(:estimate_block_based),
"Byte Counting" => method(:estimate_byte_counting),
"With GC (foreach)" => method(:estimate_with_foreach_gc),
"Each Char (limited)" => method(:estimate_with_each_char),
"Lazy Enumerator" => method(:estimate_with_lazy_enum)
}
puts "\nQuick Estimates:"
estimation_methods.each do |name, method|
puts "#{name}: #{method.call(TEST_FILE_NAME)} rows"
end
benchmark_memory_usage(TEST_FILE_NAME, estimation_methods)
benchmark_speed(TEST_FILE_NAME, estimation_methods)
check_accuracy(TEST_FILE_NAME, estimation_methods)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment