Created
March 17, 2012 21:05
-
-
Save davidm/2065267 to your computer and use it in GitHub Desktop.
hamming weight calculation tests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--[[ | |
Correctness and benchmark tests of various hamming weight implementations. | |
This is also called "popcount". | |
See also | |
http://lua-users.org/wiki/HammingWeight | |
http://stackoverflow.com/questions/109023/best-algorithm-to-count-the-number-of-set-bits-in-a-32-bit-integer | |
http://www.dalkescientific.com/writings/diary/archive/2008/07/03/hakmem_and_other_popcounts.html | |
http://perso.citi.insa-lyon.fr/claurado/ham/overview.pdf | |
http://chessprogramming.wikispaces.com/Population+Count | |
David Manura, 2012-03. | |
--]] | |
-- utility section -- | |
-- https://gist.github.com/2064991 | |
local function memoize(f) | |
local mt = {} | |
local t = setmetatable({}, mt) | |
function mt:__index(k) | |
local v = f(k); t[k] = v | |
return v | |
end | |
return t | |
end | |
-- https://gist.github.com/1414923 | |
local function requireany(...) | |
local errs = {} | |
for i = 1, select('#', ...) do local name = select(i, ...) | |
if type(name) ~= 'string' then return name, nil end | |
local ok, mod = pcall(require, name) | |
if ok then return mod, name end | |
errs[#errs+1] = mod | |
end | |
error(table.concat(errs, '\n'), 2) | |
end | |
-- testing utility | |
local OP = {} | |
OP['=='] = function(a, b) return a == b end | |
local function check(compare, a, b, name) | |
local comparef = OP[compare] | |
if not comparef(a, b) then | |
error(name..': '..compare..tostring(a)..' '..tostring(b)) | |
end | |
end | |
-- benchmarking utility. | |
-- Note: uses CPU time (clock function). | |
-- `bench` repeatedly calls `f` for at least `min_seconds` seconds (defaults to 2). | |
-- `f_count` is number of operations performed in `f` (defaults to 1). | |
-- Returns number of seconds per operation. | |
local clock = os.clock | |
local function bench(min_seconds, f, f_count) | |
min_seconds = min_seconds or 2 | |
f_count = f_count or 1 | |
local t1 = clock() | |
local ntimes = 1 | |
local count = 0 | |
while 1 do | |
for i=1,ntimes do | |
f() | |
end | |
count = count + ntimes | |
local t2 = clock() | |
if t2 - t1 >= min_seconds then | |
local period = (t2 - t1) / (count * f_count) | |
return period | |
end | |
ntimes = ntimes * 2 -- exponential increase | |
end | |
end | |
local function dump_bench(name, seconds) | |
local ts = {} | |
for i=1,#seconds do | |
ts[i] = ('%0.1E'):format(seconds[i]) | |
end | |
print(('%10s: %s s'):format(name, table.concat(ts, ' '))) | |
end | |
-- main code section -- | |
local bit = requireany('bit', 'bit32') | |
local rshift = bit.rshift | |
local band = bit.band | |
local extract8 = bit.extract or function(n, field, _) | |
return band(rshift(n, field), 0xFF) | |
end -- https://github.com/davidm/lua-bit-numberlua | |
-- Hamming weight of 32-bit integer. | |
-- Simple (naive) implementation. | |
local function hw_simple(x) | |
local sum = 0 | |
while x ~= 0 do | |
sum = sum + band(x, 1) | |
x = rshift(x, 1) | |
end | |
return sum | |
end | |
-- Hamming weight of 32-bit integer. | |
-- Implementation uses divide-and-conquer (tree), 32-bit version of popcount_2: | |
-- http://en.wikipedia.org/wiki/Hamming_weight . | |
-- Note: The modulo product (x * h01) in popcount_3 could overflow floating point. | |
local MASK1 = 0x55555555 -- repeating 01 pattern | |
local MASK2 = 0x33333333 -- repeating 0011 pattern | |
local MASK4 = 0x0F0F0F0F -- repeating 00001111 pattern | |
local function hw_dc2bit(x) | |
x = x - band(rshift(x, 1), MASK1) -- 2 bit sums | |
x = band(x, MASK2) + band(rshift(x, 2), MASK2) -- 4 bit sums | |
x = band(x + rshift(x, 4), MASK4) -- 8 bit sums | |
return band(x + rshift(x, 8) + rshift(x, 16) + rshift(x, 24), 0xFF) -- sum of bytes | |
end | |
-- Hamming weight of 32-bit integer. | |
-- Implementation uses divide-and-conguer (tree), 3-bit variant of hw_dc2bit. | |
-- Implementation provided by Egor Skriptunoff. | |
-- A similar implementation is on http://stackoverflow.com/questions/109023/ . | |
-- Based on HAKMEM #169: | |
-- http://home.pipeline.com/~hbaker1/hakmem/hacks.html#item169 . | |
local MASK001 = 0x49249249 -- repeating 001 pattern | |
local MASK011 = 0xDB6DB6DB -- repeating 011 pattern | |
local MASK3 = 0xC71C71C7 -- repeating 000111 pattern | |
local function hw_dc3bit(x) | |
x = x - band(rshift(x, 1), MASK011) | |
- band(rshift(x, 2), MASK001) -- 3 bit sums | |
x = band(x + rshift(x, 3), MASK3) -- 6 bit sums | |
x = x + rshift(x, 6) -- 12 bit sums | |
return band(x + rshift(x, 12) + rshift(x, 24), 0x3F) | |
end | |
-- Hamming weight of 32-bit integer. | |
-- Implementation uses divide-and-conquer (tree), similar to hw_dc3bit. | |
-- Based on "Intel" solution in http://wiki.cs.pdx.edu/forge/popcount.html . | |
local m1 = 0x55555555 -- repeating 01 pattern | |
local m2 = 0xC30C30C3 -- repeating 000011 pattern | |
local function hw_dci(x) | |
x = x - band(rshift(x, 1), m1) | |
x = band(x, m2) + band(rshift(x, 2), m2) + band(rshift(x, 4), m2) | |
x = x + rshift(x, 6) | |
return band(x + rshift(x, 12) + rshift(x, 24), 0x3f) | |
end | |
-- Hamming weight of 32-bit integer. | |
-- Implementation uses divide-and-conquer (tree), 4-bit variant of hw_dc3bit. | |
-- Based on | |
-- http://www.necessaryandsufficient.net/2009/04/optimising-bit-counting-using-iterative-data-driven-development/ | |
-- but without the (i * 0x01010101)>>24 trick. | |
local function hw_dc4bit(x) | |
x = x - band(rshift(x, 1), 0x77777777) | |
- band(rshift(x, 2), 0x33333333) | |
- band(rshift(x, 3), 0x11111111) | |
x = band(x + rshift(x, 4), 0xF0F0F0F) | |
return band(x + rshift(x, 8) + rshift(x, 16) + rshift(x, 24), 0xFF) | |
end | |
-- Hamming weight of 32-bit integer. | |
-- Implementation based on 8-bit lookup table (LUT), memoized. | |
local HW = memoize(hw_simple) | |
local function hw_lut8(x) | |
local n0 = band(x, 255); local x = rshift(x, 8) | |
local n1 = band(x, 255); local x = rshift(x, 8) | |
local n2 = band(x, 255); local x = rshift(x, 8) | |
local n3 = x | |
return HW[n0] + HW[n1] + HW[n2] + HW[n3] | |
end | |
--[[ inferior on LuaJit | |
local function hw_lut8(x) | |
local n0 = extract8(x, 0, 8) | |
local n1 = extract8(x, 8, 8) | |
local n2 = extract8(x, 16, 8) | |
local n3 = extract8(x, 24, 8) | |
return HW[n0] + HW[n1] + HW[n2] + HW[n3] | |
end | |
--]] | |
-- Hamming weight of 32-bit integer. | |
-- Implementation based on 8-bit lookup table (LUT), memoized, without bitops | |
local HW = memoize(hw_simple) | |
local function hw_lut8a(x) | |
local n0 = x % 256; local x = (x - n0) / 256 | |
local n1 = x % 256; local x = (x - n1) / 256 | |
local n2 = x % 256; local x = (x - n2) / 256 | |
local n3 = x | |
return HW[n0] + HW[n1] + HW[n2] + HW[n3] | |
end | |
-- Hamming weight of 32-bit integer. | |
-- Implementation based on | |
-- Peter Wegner 1960 - http://dx.doi.org/10.1145/367236.367286 | |
-- (also referenced in K&R 1988). | |
local function hw_wegner(x) | |
local sum = 0 | |
while x ~= 0 do | |
x = band(x, x-1) | |
sum = sum + 1 | |
end | |
return sum | |
end | |
local function test_hw_value(f, x) | |
check('==', hw_simple(x), f(x), x) | |
end | |
local function test_hw(f) | |
for x=0,1E+3 do | |
test_hw_value(f, x) | |
test_hw_value(f, 0xFFFFFFFF - x) | |
test_hw_value(f, math.random(2^31-1)) | |
end | |
end | |
-- note: test both high and low numbers to avoid bias in Wegner version. | |
local function make_test_func(f) | |
local M = 0xFFFFFFFF | |
local count = 0; for _=0,1E+3,7 do count = count + 10 end | |
return function() | |
local result = 0 -- guard against unused computations optimizing away. | |
for i=0,1E+3,7 do | |
result = result + f(i) + f(i) + f(i) + f(i) + f(i) | |
local i = M-i | |
result = result + f(i) + f(i) + f(i) + f(i) + f(i) | |
end | |
return result | |
end, count | |
end | |
local results = {} | |
local function bench_hw(f, name) | |
results[name] = results[name] or {} | |
table.insert(results[name], bench(2.0, make_test_func(f))) | |
dump_bench(name, results[name]) | |
end | |
-- test correctness | |
test_hw(hw_dc2bit) | |
test_hw(hw_dc3bit) | |
test_hw(hw_dc4bit) | |
test_hw(hw_dci) | |
test_hw(hw_lut8) | |
test_hw(hw_lut8a) | |
test_hw(hw_wegner) | |
-- test performance | |
for i=1,3 do | |
bench_hw(hw_simple, 'hw_simple') | |
bench_hw(hw_dc2bit, 'hw_dc2bit') | |
bench_hw(hw_dc3bit, 'hw_dc3bit') | |
bench_hw(hw_dc4bit, 'hw_dc4bit') | |
bench_hw(hw_dci, 'hw_dci') | |
bench_hw(hw_lut8, 'hw_lut8') | |
bench_hw(hw_lut8a, 'hw_lut8a') | |
bench_hw(hw_wegner, 'hw_wegner') | |
print '---' | |
end | |
print 'DONE' | |
--[[ | |
(c) 2012 David Manura. | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in | |
all copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
THE SOFTWARE. | |
[Note: individual hw_* function implementations minus comments are considered | |
public domain, but citation in source is recommended.] | |
--]] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment