Skip to content

Instantly share code, notes, and snippets.

@straight-shoota
Last active July 31, 2017 18:48
Show Gist options
  • Save straight-shoota/219eb51e741bb9c57849683ae623dd36 to your computer and use it in GitHub Desktop.
Save straight-shoota/219eb51e741bb9c57849683ae623dd36 to your computer and use it in GitHub Desktop.
Faster Command Line Tools in Crystal

This tries to solve the challenge of a fast parser and aggregator for a delimter-separated data file in Crystal.

Here are some benchmarks on my fairly old laptop with Ubuntu on Linux (so total speed is quite low) and not comparable with the results from the linked benchmarks:

                             Naive   0.14  (  7.37s ) (± 6.05%)  1.58× slower
                            Simple   0.12  (  8.05s ) (± 5.71%)  1.73× slower
                         IntParser    0.2  (  5.06s ) (± 4.63%)  1.09× slower
                             Array   0.21  (  4.83s ) (± 6.63%)  1.04× slower
                   ArrayWithBackup   0.21  (  4.66s ) (± 1.50%)       fastest
          Naive with cached stream   0.13  (  7.62s ) (± 4.11%)  1.64× slower
         Simple with cached stream   0.14  (  7.08s ) (± 1.51%)  1.52× slower
      IntParser with cached stream   0.19  (  5.35s ) (±10.71%)  1.15× slower
          Array with cached stream    0.2  (  4.98s ) (± 7.27%)  1.07× slower
ArrayWithBackup with cached stream    0.2  (  4.94s ) (± 7.10%)  1.06× slower

Running Crystal IntParser against V4: Stop using strings from the Go implementation yields the following results on my machine:

$ time ./crystal_int ngrams.tsv 1 2
max_key: 2006 sum: 22569013

real    0m6.689s
user    0m5.016s
sys     0m1.031s

$ time ./go_v4 ngrams.tsv 1 2
max_key: 2006 sum: 22569013

real    0m9.679s
user    0m8.016s
sys     0m1.438s
require "./faster"
file_path = ARGV[0]? || "./ngrams.tsv"
key_index = ARGV[1]?.try(&.to_i) || 1
value_index = ARGV[2]?.try(&.to_i) || 2
File.open(file_path) do |file|
result = {% if flag?("array_with_backup") %}Faster::ArrayWithBackup{% else %}Faster::IntParser{% end %}.process_data(file, key_index, value_index)
puts "max_key: #{result[0]} sum: #{result[1]}"
end
require "./faster"
require "benchmark"
file_path = ARGV[0]? || "./ngrams.tsv"
key_index = ARGV[1]?.try(&.to_i) || 1
value_index = ARGV[2]?.try(&.to_i) || 2
IMPLEMENTATIONS = %w(Naive Simple IntParser Array ArrayWithBackup)
file = File.open(file_path)
Benchmark.ips(25, 0) do |x|
{% for implementation in IMPLEMENTATIONS %}
x.report({{ implementation }}) do
File.open(file_path) do |file|
Faster::{{implementation.id}}.process_data(file, key_index, value_index)
end
end
{% end %}
{% for implementation in IMPLEMENTATIONS %}
x.report("{{ implementation.id }} with cached stream") do
file.rewind
Faster::{{implementation.id}}.process_data(file, key_index, value_index)
end
{% end %}
end
at_exit do
file.close
end
require "./faster"
require "benchmark"
file_path = ARGV[0]? || "./ngrams.tsv"
key_index = ARGV[1]?.try(&.to_i) || 1
value_index = ARGV[2]?.try(&.to_i) || 2
data = IO::Memory.new
time_loading = Benchmark.measure do
File.open(file_path) do |file|
IO.copy(file, data)
data.rewind
end
end
puts "Time to load data into memory:"
puts time_loading
time_executing = Benchmark.measure do
result = {% if flag?("array_with_backup") %}Faster::ArrayWithBackup{% else %}Faster::IntParser{% end %}.process_data(data, key_index, value_index)
puts "max_key: #{result[0]} sum: #{result[1]}"
end
puts
puts "Time to process data:"
puts time_executing
module Faster
DELIMITER = '\t'
NL = '\n'
module Naive
def self.process_lines(io, key_index, value_index)
io.each_line do |line|
fields = line.split(DELIMITER)
yield fields[key_index], fields[value_index]
end
end
def self.process_data(io, key_index, value_index)
sums = {} of Int32 => Int32
process_lines(io, key_index, value_index) do |key, value|
key = key.to_i
sums[key] = sums.fetch(key, 0) + value.to_i
end
sums.max_by &.[](1)
end
end
module Simple
def self.process_lines(io, key_index, value_index)
field_index = 0
key = IO::Memory.new
value = IO::Memory.new
io.each_char do |c|
case c
when NL
yield key.to_s, value.to_s
key.clear
value.clear
field_index = 0
when DELIMITER
field_index += 1
else
case field_index
when key_index
key << c
when value_index
value << c
end
end
end
end
def self.process_data(io, key_index, value_index)
sums = {} of Int32 => Int32
process_lines(io, key_index, value_index) do |key, value|
key = key.to_i
sums[key] = sums.fetch(key, 0) + value.to_i
end
sums.max_by &.[](1)
end
end
module IntParser
def self.process_lines(io, key_index, value_index)
field_index = 0
key = 0
value = 0
io.each_char do |c|
case c
when '\n'
yield key, value
key = 0
value = 0
field_index = 0
when DELIMITER
field_index += 1
else
case field_index
when key_index
key = key * 10 + c.to_i
when value_index
value = value * 10 + c.to_i
end
end
end
end
def self.process_data(io, key_index, value_index)
sums = {} of Int32 => Int32
process_lines(io, key_index, value_index) do |key, value|
sums[key] = sums.fetch(key, 0) + value
end
sums.max_by &.[](1)
end
end
module Array
def self.process_lines(io, key_index, value_index)
field_index = 0
key = 0
value = 0
io.each_char do |c|
case c
when '\n'
yield key, value
key = 0
value = 0
field_index = 0
when DELIMITER
field_index += 1
else
case field_index
when key_index
key = key * 10 + c.to_i
when value_index
value = value * 10 + c.to_i
end
end
end
end
def self.process_data(io, key_index, value_index)
sums = ::Array(Int32).new(4096, 0)
process_lines(io, key_index, value_index) do |key, value|
sums[key] += value
end
max = {-1, 0}
sums.each_with_index do |value, key|
max = {key, value} if value > max[1]
end
max
end
end
module ArrayWithBackup
ARRAY_SIZE = 4096
def self.process_lines(io, key_index, value_index)
field_index = 0
key = 0
value = 0
io.each_char do |c|
case c
when '\n'
yield key, value
key = 0
value = 0
field_index = 0
when DELIMITER
field_index += 1
else
case field_index
when key_index
key = key * 10 + c.to_i
when value_index
value = value * 10 + c.to_i
end
end
end
end
def self.process_data(io, key_index, value_index)
sums = ::Array(Int32).new(ARRAY_SIZE, 0)
sums_hash = {} of Int32 => Int32
process_lines(io, key_index, value_index) do |key, value|
if(key >= ARRAY_SIZE)
sums_hash[key] = sums_hash.fetch(key, 0) + value
else
sums[key] += value
end
end
max = {-1, 0}
max = sums_hash.max_by &.[](1) unless sums_hash.empty?
sums.each_with_index do |value, key|
max = {key, value} if value > max[1]
end
max
end
end
end
data: ngrams.tsv
ngrams.tsv:
curl https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-0.gz | gunzip > ngrams.tsv
benchmark: data
crystal cli_bench.cr --release --no-debug -- ngrams.tsv 1 2
build: bin/crystal_nuts bin/crystal_int bin/crystal_array_with_backup bin/crystal_bench
.PHONY: bin/crystal_bench
bin/crystal_bench: bin
crystal build cli_bench.cr --release --no-debug -o $@
.PHONY: bin/crystal_array_with_backup
bin/crystal_array_with_backup: bin
crystal build cli.cr --release --no-debug -o $@ -Darray_with_backup
.PHONY: bin/crystal_int
bin/crystal_int: bin
crystal build cli.cr --release --no-debug -o $@
.PHONY: bin/crystal_nuts
bin/crystal_nuts: bin
crystal build cli_nuts.cr --release --no-debug -o $@
bin:
mkdir bin
clean:
rm ngrams.tsv
rm -r bin
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment