Created
May 4, 2025 13:23
-
-
Save peterc/43d86f9ed41d41ec50b3f97e96c60691 to your computer and use it in GitHub Desktop.
A memory efficient count-min style compact approximator in Ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'zlib' | |
require 'objspace' | |
# A variant of https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch | |
class CompactApproximator | |
attr_accessor :cells | |
def initialize(size, hashes, min_elem, join_fn, meet_fn) | |
@m, @k = size, hashes | |
@cells = Array.new(@k){ Array.new(@m, min_elem) } | |
@join, @meet = join_fn, meet_fn | |
end | |
def add(key, value) | |
@k.times do |i| | |
j = (Zlib.crc32("#{i}-#{key}") % @m) | |
@cells[i][j] = @join.call(@cells[i][j], value) | |
end | |
end | |
def query(key) | |
@k.times | |
.map { |i| @cells[i][Zlib.crc32("#{i}-#{key}") % @m] } | |
.reduce { |a,b| @meet.call(a,b) } | |
end | |
end | |
sample_words = %w[the and to of a in that is it oliver said funeral kwyjibo] | |
# accumulate counts, then take min for query | |
sketch = CompactApproximator.new( | |
2000, 2, | |
0, | |
->(a,b){ a + b }, # join function = addition | |
->(a,b){ a < b ? a : b } # meet function = min value | |
) | |
exact = Hash.new(0) | |
# Read the file of Oliver Twist - grab this from wherever | |
# or some other large text | |
file = 'oliver.txt' | |
File.foreach(file) do |line| | |
line.downcase.scan(/\w+/).each do |w| | |
sketch.add(w, 1) | |
exact[w] += 1 | |
end | |
end | |
puts "Word exact est error" | |
puts "---- ----- --- -----" | |
sample_words.each do |w| | |
e = exact[w] | |
q = sketch.query(w) | |
puts "%-7s %6d %6d %6d" % [w, e, q, q - e] | |
end | |
cells_overhead = ObjectSpace.memsize_of(sketch.cells) + | |
sketch.cells.sum { |row| | |
ObjectSpace.memsize_of(row) + | |
row.sum { |elem| ObjectSpace.memsize_of(elem) } | |
} | |
puts "Memory usage:" | |
puts " Exact hash: #{ObjectSpace.memsize_of(exact)} bytes" | |
puts " @cells matrix: #{cells_overhead} bytes" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment