Benchmark comparing 7 different methods for efficiently estimating CSV row counts without reading entire files, evaluating memory usage, speed, and accuracy to identify the optimal implementation.
#!/usr/bin/env ruby

require "benchmark"
require "benchmark/memory" # provided by the benchmark-memory gem
require "csv"
require "date"
# Configuration
SAMPLE_LINES = 1000
BLOCK_SIZE = 4 * 1024 * 1024
BUFFER_SIZE = 4096
MULTI_POINT_SAMPLES = 3
GC_FREQUENCY = 100
DEFAULT_TEST_ROWS = 500_000
TEST_FILE_COLUMNS = 30
TEST_FILE_NAME = "test_data.csv"
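
# Generates a synthetic ad-performance CSV (DEFAULT_TEST_ROWS rows x TEST_FILE_COLUMNS columns)
# that all of the estimation methods below are benchmarked against.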
def generate_test_csv
  headers = [
    "Date", "Campaign", "Ad Group", "Keyword", "Match Type", "Impressions",
    "Clicks", "CTR", "CPC", "Cost", "Conversions", "CPA", "ROAS"
  ]
  headers << "Metric_#{headers.size + 1}" while headers.size < TEST_FILE_COLUMNS
  headers = headers[0...TEST_FILE_COLUMNS]

  CSV.open(TEST_FILE_NAME, "w") do |csv|
    csv << headers

    DEFAULT_TEST_ROWS.times do |i|
      row = [
        (Date.today - rand(365)).to_s,
        "Campaign_#{(i % 50) + 1}",
        "AdGroup_#{(i % 200) + 1}",
        "keyword_#{(i % 1000) + 1}",
        ["Exact", "Phrase", "Broad"].sample,
        rand(10000),
        rand(500),
        "#{(rand * 10).round(2)}%",
        "$#{(rand * 5).round(2)}",
        "$#{(rand * 1000).round(2)}",
        rand(20),
        "$#{(rand * 100).round(2)}",
        "#{(rand * 10).round(2)}x"
      ]
      row << "value_#{rand(1000)}" while row.size < TEST_FILE_COLUMNS
      csv << row
    end
  end
end
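
# Ground truth: stream the file line by line and count every row (header included).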
def count_actual_lines(file_path)
  count = 0
  File.foreach(file_path) { |_| count += 1 }
  count
end
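
# Method 1: read the first SAMPLE_LINES lines, compute their average byte size,
# and divide the total file size by it.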
def estimate_beginning_sample(file_path)
  file_size = File.size(file_path)
  lines_read = 0
  bytes_read = 0

  File.open(file_path, "r") do |file|
    SAMPLE_LINES.times do
      line = file.gets
      break unless line
      bytes_read += line.bytesize
      lines_read += 1
    end
  end

  return 0 if lines_read.zero?
  avg_line_size = bytes_read.to_f / lines_read
  (file_size / avg_line_size).to_i
end
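
# Method 2: sample lines at MULTI_POINT_SAMPLES evenly spaced offsets in the file
# to reduce bias when line lengths vary across the file.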
def estimate_multi_point(file_path)
  file_size = File.size(file_path)
  samples = []
  sample_size = SAMPLE_LINES / MULTI_POINT_SAMPLES

  File.open(file_path, "r") do |file|
    MULTI_POINT_SAMPLES.times do |i|
      position = i * (file_size / (MULTI_POINT_SAMPLES + 1))
      position = 0 if i.zero?
      file.seek(position)
      file.gets if position > 0 # discard the partial line landed on after seeking

      lines_read = 0
      bytes_read = 0
      sample_size.times do
        line = file.gets
        break unless line
        bytes_read += line.bytesize
        lines_read += 1
      end
      samples << (lines_read > 0 ? bytes_read.to_f / lines_read : 0)
    end
  end

  valid_samples = samples.reject(&:zero?)
  return 0 if valid_samples.empty?
  avg_line_size = valid_samples.sum / valid_samples.size
  (file_size / avg_line_size).to_i
end
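
# Method 3: read a single BLOCK_SIZE chunk from the start of the file and
# extrapolate from its newline density.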
def estimate_block_based(file_path)
  file_size = File.size(file_path)
  block = File.read(file_path, BLOCK_SIZE)
  newline_count = block.count("\n")

  return 0 if newline_count.zero?
  bytes_per_line = block.bytesize.to_f / newline_count
  (file_size / bytes_per_line).to_i
end
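
# Method 4: count newlines in BUFFER_SIZE chunks until roughly BLOCK_SIZE bytes
# have been scanned, then extrapolate to the full file size.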
def estimate_byte_counting(file_path)
  file_size = File.size(file_path)
  newline_count = 0
  bytes_read = 0

  File.open(file_path, "r") do |file|
    while (buffer = file.read(BUFFER_SIZE))
      newline_count += buffer.count("\n")
      bytes_read += buffer.bytesize
      break if bytes_read >= BLOCK_SIZE
    end
  end

  return 0 if newline_count.zero?
  (file_size.to_f / bytes_read * newline_count).to_i
end
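
# Method 5: same sampling as Method 1, but forces a GC cycle every GC_FREQUENCY
# lines while reading the sample.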
def estimate_with_foreach_gc(file_path)
  file_size = File.size(file_path)
  bytes_read = 0
  line_count = 0

  File.open(file_path, "r") do |file|
    SAMPLE_LINES.times do |i|
      line = file.gets
      break unless line
      bytes_read += line.bytesize
      line_count += 1
      GC.start if i % GC_FREQUENCY == 0
    end
  end

  return 0 if line_count.zero?
  avg_line_size = bytes_read.to_f / line_count
  (file_size / avg_line_size).to_i
end
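
# Method 6: scan the file character by character, counting newlines, limited to
# the first BLOCK_SIZE characters.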
def estimate_with_each_char(file_path)
  file_size = File.size(file_path)
  newline_count = 0
  bytes_read = 0

  File.open(file_path, "r") do |file|
    char_count = 0
    file.each_char do |char|
      newline_count += 1 if char == "\n"
      bytes_read += 1
      char_count += 1
      break if char_count >= BLOCK_SIZE
    end
  end

  return 0 if newline_count.zero?
  (file_size.to_f / bytes_read * newline_count).to_i
end
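
# Method 7: take the first SAMPLE_LINES lines through a lazy enumerator and
# extrapolate from their average byte size.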
def estimate_with_lazy_enum(file_path)
  file_size = File.size(file_path)
  # Open with a block so the file handle is closed once the lazy enumerator is forced.
  samples = File.open(file_path) { |file| file.each_line.lazy.take(SAMPLE_LINES).to_a }
  bytes_read = samples.sum(&:bytesize)
  line_count = samples.size

  return 0 if line_count.zero?
  avg_line_size = bytes_read.to_f / line_count
  (file_size / avg_line_size).to_i
end
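
# Reports allocated memory and objects for each method via benchmark-memory.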
def benchmark_memory_usage(file_path, methods)
  puts "\nMemory Usage Benchmark:"
  Benchmark.memory do |x|
    methods.each do |name, method|
      x.report(name) { method.call(file_path) }
    end
    x.compare!
  end
end
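
# Wall-clock timing for each method via Benchmark.bm.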
def benchmark_speed(file_path, methods)
  puts "\nSpeed Benchmark:"
  Benchmark.bm(25) do |x|
    methods.each do |name, method|
      x.report(name) { method.call(file_path) }
    end
  end
end
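
# Compares each estimate against the exact line count; skipped for files of 100 MB or more.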
def check_accuracy(file_path, methods)
  file_size = File.size(file_path)

  # Skip accuracy check for very large files
  if file_size >= 100 * 1024 * 1024
    puts "\nSkipping accuracy check for large file (#{file_size / 1024 / 1024} MB)"
    return
  end

  puts "\nAccuracy Check:"
  actual = count_actual_lines(file_path)
  puts "Actual line count: #{actual}"

  methods.each do |name, method|
    estimate = method.call(file_path)
    error = actual.zero? ? 0 : ((estimate - actual).abs.to_f / actual * 100).round(2)
    puts "#{name}: #{estimate} (Error: #{error}%)"
  end
end

generate_test_csv
puts "Generated test CSV file with #{DEFAULT_TEST_ROWS} rows and #{TEST_FILE_COLUMNS} columns: #{TEST_FILE_NAME}"
puts "File size: #{File.size(TEST_FILE_NAME)} bytes"

estimation_methods = {
  "Beginning Sample" => method(:estimate_beginning_sample),
  "Multi-Point" => method(:estimate_multi_point),
  "Block-Based" => method(:estimate_block_based),
  "Byte Counting" => method(:estimate_byte_counting),
  "With GC (foreach)" => method(:estimate_with_foreach_gc),
  "Each Char (limited)" => method(:estimate_with_each_char),
  "Lazy Enumerator" => method(:estimate_with_lazy_enum)
}

puts "\nQuick Estimates:"
estimation_methods.each do |name, method|
  puts "#{name}: #{method.call(TEST_FILE_NAME)} rows"
end

benchmark_memory_usage(TEST_FILE_NAME, estimation_methods)
benchmark_speed(TEST_FILE_NAME, estimation_methods)
check_accuracy(TEST_FILE_NAME, estimation_methods)