Benchmark comparing 7 different methods for efficiently estimating CSV row counts without reading entire files, evaluating memory usage, speed, and accuracy to identify the optimal implementation.
#!/usr/bin/env ruby

require "benchmark"
require "benchmark/memory" # provided by the benchmark-memory gem
require "csv"
require "date"
# Configuration
SAMPLE_LINES = 1000
BLOCK_SIZE = 4 * 1024 * 1024
BUFFER_SIZE = 4096
MULTI_POINT_SAMPLES = 3
GC_FREQUENCY = 100
DEFAULT_TEST_ROWS = 500_000
TEST_FILE_COLUMNS = 30
TEST_FILE_NAME = "test_data.csv"
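
# Generates a synthetic ad-performance CSV (DEFAULT_TEST_ROWS rows x TEST_FILE_COLUMNS columns)
# that all of the estimation methods below are benchmarked against.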
def generate_test_csv
  headers = [
    "Date", "Campaign", "Ad Group", "Keyword", "Match Type", "Impressions",
    "Clicks", "CTR", "CPC", "Cost", "Conversions", "CPA", "ROAS"
  ]
  headers << "Metric_#{headers.size + 1}" while headers.size < TEST_FILE_COLUMNS
  headers = headers[0...TEST_FILE_COLUMNS]

  CSV.open(TEST_FILE_NAME, "w") do |csv|
    csv << headers

    DEFAULT_TEST_ROWS.times do |i|
      row = [
        (Date.today - rand(365)).to_s,
        "Campaign_#{(i % 50) + 1}",
        "AdGroup_#{(i % 200) + 1}",
        "keyword_#{(i % 1000) + 1}",
        ["Exact", "Phrase", "Broad"].sample,
        rand(10000),
        rand(500),
        "#{(rand * 10).round(2)}%",
        "$#{(rand * 5).round(2)}",
        "$#{(rand * 1000).round(2)}",
        rand(20),
        "$#{(rand * 100).round(2)}",
        "#{(rand * 10).round(2)}x"
      ]
      row << "value_#{rand(1000)}" while row.size < TEST_FILE_COLUMNS
      csv << row
    end
  end
end
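
# Ground truth: stream the file line by line and count every row (header included).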
def count_actual_lines(file_path)
  count = 0
  File.foreach(file_path) { |_| count += 1 }
  count
end
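
# Method 1: read the first SAMPLE_LINES lines, compute their average byte size,
# and divide the total file size by it.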
def estimate_beginning_sample(file_path)
  file_size = File.size(file_path)
  lines_read = 0
  bytes_read = 0

  File.open(file_path, "r") do |file|
    SAMPLE_LINES.times do
      line = file.gets
      break unless line
      bytes_read += line.bytesize
      lines_read += 1
    end
  end

  return 0 if lines_read.zero?
  avg_line_size = bytes_read.to_f / lines_read
  (file_size / avg_line_size).to_i
end
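
# Method 2: sample lines at MULTI_POINT_SAMPLES evenly spaced offsets in the file
# to reduce bias when line lengths vary across the file.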
def estimate_multi_point(file_path)
  file_size = File.size(file_path)
  samples = []
  sample_size = SAMPLE_LINES / MULTI_POINT_SAMPLES

  File.open(file_path, "r") do |file|
    MULTI_POINT_SAMPLES.times do |i|
      position = i * (file_size / (MULTI_POINT_SAMPLES + 1))
      position = 0 if i.zero?
      file.seek(position)
      file.gets if position > 0 # discard the partial line landed on after seeking

      lines_read = 0
      bytes_read = 0
      sample_size.times do
        line = file.gets
        break unless line
        bytes_read += line.bytesize
        lines_read += 1
      end
      samples << (lines_read > 0 ? bytes_read.to_f / lines_read : 0)
    end
  end

  valid_samples = samples.reject(&:zero?)
  return 0 if valid_samples.empty?
  avg_line_size = valid_samples.sum / valid_samples.size
  (file_size / avg_line_size).to_i
end
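
# Method 3: read a single BLOCK_SIZE chunk from the start of the file and
# extrapolate from its newline density.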
def estimate_block_based(file_path)
  file_size = File.size(file_path)
  block = File.read(file_path, BLOCK_SIZE)
  newline_count = block.count("\n")

  return 0 if newline_count.zero?
  bytes_per_line = block.bytesize.to_f / newline_count
  (file_size / bytes_per_line).to_i
end
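
# Method 4: count newlines in BUFFER_SIZE chunks until roughly BLOCK_SIZE bytes
# have been scanned, then extrapolate to the full file size.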
def estimate_byte_counting(file_path)
  file_size = File.size(file_path)
  newline_count = 0
  bytes_read = 0

  File.open(file_path, "r") do |file|
    while (buffer = file.read(BUFFER_SIZE))
      newline_count += buffer.count("\n")
      bytes_read += buffer.bytesize
      break if bytes_read >= BLOCK_SIZE
    end
  end

  return 0 if newline_count.zero?
  (file_size.to_f / bytes_read * newline_count).to_i
end
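
# Method 5: same sampling as Method 1, but forces a GC cycle every GC_FREQUENCY
# lines while reading the sample.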
def estimate_with_foreach_gc(file_path)
  file_size = File.size(file_path)
  bytes_read = 0
  line_count = 0

  File.open(file_path, "r") do |file|
    SAMPLE_LINES.times do |i|
      line = file.gets
      break unless line
      bytes_read += line.bytesize
      line_count += 1
      GC.start if i % GC_FREQUENCY == 0
    end
  end

  return 0 if line_count.zero?
  avg_line_size = bytes_read.to_f / line_count
  (file_size / avg_line_size).to_i
end
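
# Method 6: scan the file character by character, counting newlines, limited to
# the first BLOCK_SIZE characters.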
def estimate_with_each_char(file_path)
  file_size = File.size(file_path)
  newline_count = 0
  bytes_read = 0

  File.open(file_path, "r") do |file|
    char_count = 0
    file.each_char do |char|
      newline_count += 1 if char == "\n"
      bytes_read += 1
      char_count += 1
      break if char_count >= BLOCK_SIZE
    end
  end

  return 0 if newline_count.zero?
  (file_size.to_f / bytes_read * newline_count).to_i
end
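
# Method 7: take the first SAMPLE_LINES lines through a lazy enumerator and
# extrapolate from their average byte size.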
def estimate_with_lazy_enum(file_path)
  file_size = File.size(file_path)
  # Open with a block so the file handle is closed once the lazy enumerator is forced.
  samples = File.open(file_path) { |file| file.each_line.lazy.take(SAMPLE_LINES).to_a }
  bytes_read = samples.sum(&:bytesize)
  line_count = samples.size

  return 0 if line_count.zero?
  avg_line_size = bytes_read.to_f / line_count
  (file_size / avg_line_size).to_i
end
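
# Reports allocated memory and objects for each method via benchmark-memory.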
def benchmark_memory_usage(file_path, methods)
  puts "\nMemory Usage Benchmark:"
  Benchmark.memory do |x|
    methods.each do |name, method|
      x.report(name) { method.call(file_path) }
    end
    x.compare!
  end
end
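
# Wall-clock timing for each method via Benchmark.bm.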
def benchmark_speed(file_path, methods)
  puts "\nSpeed Benchmark:"
  Benchmark.bm(25) do |x|
    methods.each do |name, method|
      x.report(name) { method.call(file_path) }
    end
  end
end
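
# Compares each estimate against the exact line count; skipped for files of 100 MB or more.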
def check_accuracy(file_path, methods)
  file_size = File.size(file_path)

  # Skip accuracy check for very large files
  if file_size >= 100 * 1024 * 1024
    puts "\nSkipping accuracy check for large file (#{file_size / 1024 / 1024} MB)"
    return
  end

  puts "\nAccuracy Check:"
  actual = count_actual_lines(file_path)
  puts "Actual line count: #{actual}"

  methods.each do |name, method|
    estimate = method.call(file_path)
    error = actual.zero? ? 0 : ((estimate - actual).abs.to_f / actual * 100).round(2)
    puts "#{name}: #{estimate} (Error: #{error}%)"
  end
end

generate_test_csv
puts "Generated test CSV file with #{DEFAULT_TEST_ROWS} rows and #{TEST_FILE_COLUMNS} columns: #{TEST_FILE_NAME}"
puts "File size: #{File.size(TEST_FILE_NAME)} bytes"

estimation_methods = {
  "Beginning Sample" => method(:estimate_beginning_sample),
  "Multi-Point" => method(:estimate_multi_point),
  "Block-Based" => method(:estimate_block_based),
  "Byte Counting" => method(:estimate_byte_counting),
  "With GC (foreach)" => method(:estimate_with_foreach_gc),
  "Each Char (limited)" => method(:estimate_with_each_char),
  "Lazy Enumerator" => method(:estimate_with_lazy_enum)
}

puts "\nQuick Estimates:"
estimation_methods.each do |name, method|
  puts "#{name}: #{method.call(TEST_FILE_NAME)} rows"
end

benchmark_memory_usage(TEST_FILE_NAME, estimation_methods)
benchmark_speed(TEST_FILE_NAME, estimation_methods)
check_accuracy(TEST_FILE_NAME, estimation_methods)