davidm · March 17, 2012 21:05
diff --git a/hamming_weight_test.lua b/hamming_weight_test.lua
 --[[
 Correctness and benchmark tests of various hamming weight implementations.
 This is also called "popcount".
 See also
   http://lua-users.org/wiki/HammingWeight
   http://stackoverflow.com/questions/109023/best-algorithm-to-count-the-number-of-set-bits-in-a-32-bit-integer
   http://www.dalkescientific.com/writings/diary/archive/2008/07/03/hakmem_and_other_popcounts.html
   http://perso.citi.insa-lyon.fr/claurado/ham/overview.pdf
   http://chessprogramming.wikispaces.com/Population+Count
 David Manura, 2012-03.
 --]]

 -- utility section --

 -- https://gist.github.com/2064991
 local function memoize(f)
  local mt = {}
  local t = setmetatable({}, mt)
  function mt:__index(k)
    local v = f(k); t[k] = v
    return v
  end
  return t
 end

 -- https://gist.github.com/1414923
 local function requireany(...)
  local errs = {}
  for i = 1, select('#', ...) do local name = select(i, ...)
    if type(name) ~= 'string' then return name, nil end
    local ok, mod = pcall(require, name)
    if ok then return mod, name end
    errs[#errs+1] = mod
  end
  error(table.concat(errs, '\n'), 2)
 end

 -- testing utility
 local OP = {}
 OP['=='] = function(a, b) return a == b end
 local function check(compare, a, b, name)
  local comparef = OP[compare]
  if not comparef(a, b) then
    error(name..': '..compare..tostring(a)..' '..tostring(b))
  end
 end

 -- benchmarking utility.
 -- Note: uses CPU time (clock function).
 -- `bench` repeatedly calls `f` for at least `min_seconds` seconds (defaults to 2).
 --   `f_count` is number of operations performed in `f` (defaults to 1).
 --   Returns number of seconds per operation.
 local clock = os.clock
 local function bench(min_seconds, f, f_count)
  min_seconds = min_seconds or 2
  f_count = f_count or 1
  local t1 = clock()
  local ntimes = 1
  local count = 0
  while 1 do
    for i=1,ntimes do
      f()
    end
    count = count + ntimes
    local t2 = clock()
    if t2 - t1 >= min_seconds then
      local period = (t2 - t1) / (count * f_count)
      return period
    end
    ntimes = ntimes * 2  -- exponential increase
  end
 end
 local function dump_bench(name, seconds)
  local ts = {}
  for i=1,#seconds do
    ts[i] = ('%0.1E'):format(seconds[i])
  end
  print(('%10s: %s s'):format(name, table.concat(ts, ' ')))
 end


 -- main code section --

 local bit = requireany('bit', 'bit32')
 local rshift = bit.rshift
 local band = bit.band
 local extract8 = bit.extract or function(n, field, _)
  return band(rshift(n, field), 0xFF)
 end  -- https://github.com/davidm/lua-bit-numberlua

 -- Hamming weight of 32-bit integer.
 -- Simple (naive) implementation.
 local function hw_simple(x)
  local sum = 0
  while x ~= 0 do
    sum = sum + band(x, 1)
    x = rshift(x, 1)
  end
  return sum
 end

 -- Hamming weight of 32-bit integer.
 -- Implementation uses divide-and-conquer (tree), 32-bit version of popcount_2:
 -- http://en.wikipedia.org/wiki/Hamming_weight .
 -- Note: The modulo product (x * h01) in popcount_3 could overflow floating point.
 local MASK1 = 0x55555555  -- repeating 01 pattern
 local MASK2 = 0x33333333  -- repeating 0011 pattern
 local MASK4 = 0x0F0F0F0F  -- repeating 00001111 pattern
 local function hw_dc2bit(x)
  x = x - band(rshift(x, 1), MASK1)              -- 2 bit sums
  x = band(x, MASK2) + band(rshift(x, 2), MASK2) -- 4 bit sums
  x = band(x + rshift(x, 4), MASK4)              -- 8 bit sums
  return band(x + rshift(x, 8) + rshift(x, 16) + rshift(x, 24), 0xFF) -- sum of bytes
 end

 -- Hamming weight of 32-bit integer.
 -- Implementation uses divide-and-conguer (tree), 3-bit variant of hw_dc2bit.
 -- Implementation provided by Egor Skriptunoff.
 -- A similar implementation is on http://stackoverflow.com/questions/109023/ .
 -- Based on HAKMEM #169:
 --   http://home.pipeline.com/~hbaker1/hakmem/hacks.html#item169 .
 local MASK001 = 0x49249249 -- repeating 001 pattern
 local MASK011 = 0xDB6DB6DB -- repeating 011 pattern
 local MASK3   = 0xC71C71C7 -- repeating 000111 pattern
 local function hw_dc3bit(x)
  x = x - band(rshift(x, 1), MASK011)
        - band(rshift(x, 2), MASK001) -- 3 bit sums
  x = band(x + rshift(x, 3), MASK3)   -- 6 bit sums
  x = x + rshift(x, 6)                -- 12 bit sums
  return band(x + rshift(x, 12) + rshift(x, 24), 0x3F)
 end

 -- Hamming weight of 32-bit integer.
 -- Implementation uses divide-and-conquer (tree), similar to hw_dc3bit.
 -- Based on "Intel" solution in http://wiki.cs.pdx.edu/forge/popcount.html .
 local m1 = 0x55555555 -- repeating 01 pattern
 local m2 = 0xC30C30C3 -- repeating 000011 pattern
 local function hw_dci(x)
  x = x - band(rshift(x, 1), m1)
  x = band(x, m2) + band(rshift(x, 2), m2) + band(rshift(x, 4), m2)
  x = x + rshift(x, 6)
  return band(x + rshift(x, 12) + rshift(x, 24), 0x3f)
 end

 -- Hamming weight of 32-bit integer.
 -- Implementation uses divide-and-conquer (tree), 4-bit variant of hw_dc3bit.
 -- Based on
 -- http://www.necessaryandsufficient.net/2009/04/optimising-bit-counting-using-iterative-data-driven-development/
 -- but without the (i * 0x01010101)>>24 trick.
 local function hw_dc4bit(x)
  x = x - band(rshift(x, 1), 0x77777777)
        - band(rshift(x, 2), 0x33333333)
        - band(rshift(x, 3), 0x11111111)
  x = band(x + rshift(x, 4), 0xF0F0F0F)
  return band(x + rshift(x, 8) + rshift(x, 16) + rshift(x, 24), 0xFF)
 end

 -- Hamming weight of 32-bit integer.
 -- Implementation based on 8-bit lookup table (LUT), memoized.
 local HW = memoize(hw_simple)
 local function hw_lut8(x)
  local n0 = band(x, 255); local x = rshift(x, 8)
  local n1 = band(x, 255); local x = rshift(x, 8)
  local n2 = band(x, 255); local x = rshift(x, 8)
  local n3 = x
  return HW[n0] + HW[n1] + HW[n2] + HW[n3]
 end
 --[[ inferior on LuaJit
 local function hw_lut8(x)
  local n0 = extract8(x, 0, 8)
  local n1 = extract8(x, 8, 8)
  local n2 = extract8(x, 16, 8)
  local n3 = extract8(x, 24, 8)
  return HW[n0] + HW[n1] + HW[n2] + HW[n3]
 end
 --]]

 -- Hamming weight of 32-bit integer.
 -- Implementation based on 8-bit lookup table (LUT), memoized, without bitops
 local HW = memoize(hw_simple)
 local function hw_lut8a(x)
  local n0 = x % 256; local x = (x - n0) / 256
  local n1 = x % 256; local x = (x - n1) / 256
  local n2 = x % 256; local x = (x - n2) / 256
  local n3 = x
  return HW[n0] + HW[n1] + HW[n2] + HW[n3]
 end

 -- Hamming weight of 32-bit integer.
 -- Implementation based on
 -- Peter Wegner 1960 - http://dx.doi.org/10.1145/367236.367286
 -- (also referenced in K&R 1988).
 local function hw_wegner(x)
  local sum = 0
  while x ~= 0 do
    x = band(x, x-1)
    sum = sum + 1
  end
  return sum
 end

 local function test_hw_value(f, x)
  check('==', hw_simple(x), f(x), x)
 end

 local function test_hw(f)
  for x=0,1E+3 do
    test_hw_value(f, x)
    test_hw_value(f, 0xFFFFFFFF - x)
    test_hw_value(f, math.random(2^31-1))
  end
 end

 -- note: test both high and low numbers to avoid bias in Wegner version.
 local function make_test_func(f)
  local M = 0xFFFFFFFF
  local count = 0; for _=0,1E+3,7 do count = count + 10 end
  return function()
    local result = 0 -- guard against unused computations optimizing away.
    for i=0,1E+3,7 do
      result = result + f(i) + f(i) + f(i) + f(i) + f(i)
      local i = M-i
      result = result + f(i) + f(i) + f(i) + f(i) + f(i)
    end
    return result
  end, count
 end

 local results = {}
 local function bench_hw(f, name)
  results[name] = results[name] or {}
  table.insert(results[name], bench(2.0, make_test_func(f)))
  dump_bench(name, results[name])
 end

 -- test correctness
 test_hw(hw_dc2bit)
 test_hw(hw_dc3bit)
 test_hw(hw_dc4bit)
 test_hw(hw_dci)
 test_hw(hw_lut8)
 test_hw(hw_lut8a)
 test_hw(hw_wegner)


 -- test performance
 for i=1,3 do
  bench_hw(hw_simple,  'hw_simple')
  bench_hw(hw_dc2bit,  'hw_dc2bit')
  bench_hw(hw_dc3bit,  'hw_dc3bit')
  bench_hw(hw_dc4bit,  'hw_dc4bit')
  bench_hw(hw_dci,     'hw_dci')
  bench_hw(hw_lut8,    'hw_lut8')
  bench_hw(hw_lut8a,   'hw_lut8a')
  bench_hw(hw_wegner,  'hw_wegner')
  print '---'
 end

 print 'DONE'


 --[[
 (c) 2012 David Manura.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.

 [Note: individual hw_* function implementations minus comments are considered
 public domain, but citation in source is recommended.]
 --]]
	--[[
	Correctness and benchmark tests of various hamming weight implementations.
	This is also called "popcount".
	See also
	http://lua-users.org/wiki/HammingWeight
	http://stackoverflow.com/questions/109023/best-algorithm-to-count-the-number-of-set-bits-in-a-32-bit-integer
	http://www.dalkescientific.com/writings/diary/archive/2008/07/03/hakmem_and_other_popcounts.html
	http://perso.citi.insa-lyon.fr/claurado/ham/overview.pdf
	http://chessprogramming.wikispaces.com/Population+Count
	David Manura, 2012-03.
	--]]

	-- utility section --

	-- https://gist.github.com/2064991
	local function memoize(f)
	local mt = {}
	local t = setmetatable({}, mt)
	function mt:__index(k)
	local v = f(k); t[k] = v
	return v
	end
	return t
	end

	-- https://gist.github.com/1414923
	local function requireany(...)
	local errs = {}
	for i = 1, select('#', ...) do local name = select(i, ...)
	if type(name) ~= 'string' then return name, nil end
	local ok, mod = pcall(require, name)
	if ok then return mod, name end
	errs[#errs+1] = mod
	end
	error(table.concat(errs, '\n'), 2)
	end

	-- testing utility
	local OP = {}
	OP['=='] = function(a, b) return a == b end
	local function check(compare, a, b, name)
	local comparef = OP[compare]
	if not comparef(a, b) then
	error(name..': '..compare..tostring(a)..' '..tostring(b))
	end
	end

	-- benchmarking utility.
	-- Note: uses CPU time (clock function).
	-- `bench` repeatedly calls `f` for at least `min_seconds` seconds (defaults to 2).
	-- `f_count` is number of operations performed in `f` (defaults to 1).
	-- Returns number of seconds per operation.
	local clock = os.clock
	local function bench(min_seconds, f, f_count)
	min_seconds = min_seconds or 2
	f_count = f_count or 1
	local t1 = clock()
	local ntimes = 1
	local count = 0
	while 1 do
	for i=1,ntimes do
	f()
	end
	count = count + ntimes
	local t2 = clock()
	if t2 - t1 >= min_seconds then
	local period = (t2 - t1) / (count * f_count)
	return period
	end
	ntimes = ntimes * 2 -- exponential increase
	end
	end
	local function dump_bench(name, seconds)
	local ts = {}
	for i=1,#seconds do
	ts[i] = ('%0.1E'):format(seconds[i])
	end
	print(('%10s: %s s'):format(name, table.concat(ts, ' ')))
	end


	-- main code section --

	local bit = requireany('bit', 'bit32')
	local rshift = bit.rshift
	local band = bit.band
	local extract8 = bit.extract or function(n, field, _)
	return band(rshift(n, field), 0xFF)
	end -- https://github.com/davidm/lua-bit-numberlua

	-- Hamming weight of 32-bit integer.
	-- Simple (naive) implementation.
	local function hw_simple(x)
	local sum = 0
	while x ~= 0 do
	sum = sum + band(x, 1)
	x = rshift(x, 1)
	end
	return sum
	end

	-- Hamming weight of 32-bit integer.
	-- Implementation uses divide-and-conquer (tree), 32-bit version of popcount_2:
	-- http://en.wikipedia.org/wiki/Hamming_weight .
	-- Note: The modulo product (x * h01) in popcount_3 could overflow floating point.
	local MASK1 = 0x55555555 -- repeating 01 pattern
	local MASK2 = 0x33333333 -- repeating 0011 pattern
	local MASK4 = 0x0F0F0F0F -- repeating 00001111 pattern
	local function hw_dc2bit(x)
	x = x - band(rshift(x, 1), MASK1) -- 2 bit sums
	x = band(x, MASK2) + band(rshift(x, 2), MASK2) -- 4 bit sums
	x = band(x + rshift(x, 4), MASK4) -- 8 bit sums
	return band(x + rshift(x, 8) + rshift(x, 16) + rshift(x, 24), 0xFF) -- sum of bytes
	end

	-- Hamming weight of 32-bit integer.
	-- Implementation uses divide-and-conguer (tree), 3-bit variant of hw_dc2bit.
	-- Implementation provided by Egor Skriptunoff.
	-- A similar implementation is on http://stackoverflow.com/questions/109023/ .
	-- Based on HAKMEM #169:
	-- http://home.pipeline.com/~hbaker1/hakmem/hacks.html#item169 .
	local MASK001 = 0x49249249 -- repeating 001 pattern
	local MASK011 = 0xDB6DB6DB -- repeating 011 pattern
	local MASK3 = 0xC71C71C7 -- repeating 000111 pattern
	local function hw_dc3bit(x)
	x = x - band(rshift(x, 1), MASK011)
	- band(rshift(x, 2), MASK001) -- 3 bit sums
	x = band(x + rshift(x, 3), MASK3) -- 6 bit sums
	x = x + rshift(x, 6) -- 12 bit sums
	return band(x + rshift(x, 12) + rshift(x, 24), 0x3F)
	end

	-- Hamming weight of 32-bit integer.
	-- Implementation uses divide-and-conquer (tree), similar to hw_dc3bit.
	-- Based on "Intel" solution in http://wiki.cs.pdx.edu/forge/popcount.html .
	local m1 = 0x55555555 -- repeating 01 pattern
	local m2 = 0xC30C30C3 -- repeating 000011 pattern
	local function hw_dci(x)
	x = x - band(rshift(x, 1), m1)
	x = band(x, m2) + band(rshift(x, 2), m2) + band(rshift(x, 4), m2)
	x = x + rshift(x, 6)
	return band(x + rshift(x, 12) + rshift(x, 24), 0x3f)
	end

	-- Hamming weight of 32-bit integer.
	-- Implementation uses divide-and-conquer (tree), 4-bit variant of hw_dc3bit.
	-- Based on
	-- http://www.necessaryandsufficient.net/2009/04/optimising-bit-counting-using-iterative-data-driven-development/
	-- but without the (i * 0x01010101)>>24 trick.
	local function hw_dc4bit(x)
	x = x - band(rshift(x, 1), 0x77777777)
	- band(rshift(x, 2), 0x33333333)
	- band(rshift(x, 3), 0x11111111)
	x = band(x + rshift(x, 4), 0xF0F0F0F)
	return band(x + rshift(x, 8) + rshift(x, 16) + rshift(x, 24), 0xFF)
	end

	-- Hamming weight of 32-bit integer.
	-- Implementation based on 8-bit lookup table (LUT), memoized.
	local HW = memoize(hw_simple)
	local function hw_lut8(x)
	local n0 = band(x, 255); local x = rshift(x, 8)
	local n1 = band(x, 255); local x = rshift(x, 8)
	local n2 = band(x, 255); local x = rshift(x, 8)
	local n3 = x
	return HW[n0] + HW[n1] + HW[n2] + HW[n3]
	end
	--[[ inferior on LuaJit
	local function hw_lut8(x)
	local n0 = extract8(x, 0, 8)
	local n1 = extract8(x, 8, 8)
	local n2 = extract8(x, 16, 8)
	local n3 = extract8(x, 24, 8)
	return HW[n0] + HW[n1] + HW[n2] + HW[n3]
	end
	--]]

	-- Hamming weight of 32-bit integer.
	-- Implementation based on 8-bit lookup table (LUT), memoized, without bitops
	local HW = memoize(hw_simple)
	local function hw_lut8a(x)
	local n0 = x % 256; local x = (x - n0) / 256
	local n1 = x % 256; local x = (x - n1) / 256
	local n2 = x % 256; local x = (x - n2) / 256
	local n3 = x
	return HW[n0] + HW[n1] + HW[n2] + HW[n3]
	end

	-- Hamming weight of 32-bit integer.
	-- Implementation based on
	-- Peter Wegner 1960 - http://dx.doi.org/10.1145/367236.367286
	-- (also referenced in K&R 1988).
	local function hw_wegner(x)
	local sum = 0
	while x ~= 0 do
	x = band(x, x-1)
	sum = sum + 1
	end
	return sum
	end

	local function test_hw_value(f, x)
	check('==', hw_simple(x), f(x), x)
	end

	local function test_hw(f)
	for x=0,1E+3 do
	test_hw_value(f, x)
	test_hw_value(f, 0xFFFFFFFF - x)
	test_hw_value(f, math.random(2^31-1))
	end
	end

	-- note: test both high and low numbers to avoid bias in Wegner version.
	local function make_test_func(f)
	local M = 0xFFFFFFFF
	local count = 0; for _=0,1E+3,7 do count = count + 10 end
	return function()
	local result = 0 -- guard against unused computations optimizing away.
	for i=0,1E+3,7 do
	result = result + f(i) + f(i) + f(i) + f(i) + f(i)
	local i = M-i
	result = result + f(i) + f(i) + f(i) + f(i) + f(i)
	end
	return result
	end, count
	end

	local results = {}
	local function bench_hw(f, name)
	results[name] = results[name] or {}
	table.insert(results[name], bench(2.0, make_test_func(f)))
	dump_bench(name, results[name])
	end

	-- test correctness
	test_hw(hw_dc2bit)
	test_hw(hw_dc3bit)
	test_hw(hw_dc4bit)
	test_hw(hw_dci)
	test_hw(hw_lut8)
	test_hw(hw_lut8a)
	test_hw(hw_wegner)


	-- test performance
	for i=1,3 do
	bench_hw(hw_simple, 'hw_simple')
	bench_hw(hw_dc2bit, 'hw_dc2bit')
	bench_hw(hw_dc3bit, 'hw_dc3bit')
	bench_hw(hw_dc4bit, 'hw_dc4bit')
	bench_hw(hw_dci, 'hw_dci')
	bench_hw(hw_lut8, 'hw_lut8')
	bench_hw(hw_lut8a, 'hw_lut8a')
	bench_hw(hw_wegner, 'hw_wegner')
	print '---'
	end

	print 'DONE'


	--[[
	(c) 2012 David Manura.

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the "Software"), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the Software is
	furnished to do so, subject to the following conditions:

	The above copyright notice and this permission notice shall be included in
	all copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	THE SOFTWARE.

	[Note: individual hw_* function implementations minus comments are considered
	public domain, but citation in source is recommended.]
	--]]