Created
August 26, 2017 14:43
-
-
Save kathawala/ec90a52f084a6e3cb9c9157c269afe79 to your computer and use it in GitHub Desktop.
Fully works for all rounds
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// TODO: Finish this up, test speed vs GPU vs emscripten transpiled C->asm.js | |
// Currently looking at the keccak function, which is given as input: | |
// input - in our case the "caveat emptor" string | |
// inlen - length of input = 13 | |
// md - the buffer of 200 bytes we want to fill up | |
// stlen - length of uint64_t array with 25 elements = 200 bytes | |
// ( aka: output length we desire ) | |
// Reading through the code there is: | |
// a temp buffer of 144 bytes | |
// a st buffer of 200 bytes | |
// rsiz = 136 | |
// rsizw = 136 / 8 = 17 | |
// rsiz and rsizw are "rate" and "rateInBytes" in the Keccak reference, but "rate" | |
// in Keccak reference is 1088, thus "rateInBytes" should be 136...? | |
// There is a loop which is sometimes skipped, but for now we will ignore it, | |
// it seems to move along the input and do some computations until it is only | |
// looking at input where input is lt rsiz (rsiz == 136) | |
// This loop seems only to be triggered when the input is gte rsiz (rsiz == 136) | |
// Afterwards we copy the input (or what's left of it) to the temp buffer | |
// Then we add a 1 to the end of what's been copied and pad the temp buffer up | |
// to rsiz size (i.e. until it is 136 bytes) with 0s | |
// Then we bitwise OR the last byte of rsiz (i.e. temp[rsiz-1]) with 0x80 | |
// Then in a for loop for rsizw (rsizw == 17) rounds we do this | |
// in every iteration | |
// for every 8 bytes of st, | |
// we set those 8 bytes to bitwise XOR of those 8 bytes with 8 bytes from temp | |
// Now we call keccakf on st with the number of Keccak Rounds set at 24 | |
// Then we memcpy st into md and return (voila! keccak hash!) | |
// the keccakf function is given the following inputs: | |
// st - a buffer of 8 byte values with 25 values in it | |
// rounds - number of rounds (in our case 24) | |
// we then initialize the following vars | |
// t - an 8 byte value | |
// bc - a 5 value array of 8 byte values | |
// Thus, for each round there are 4 steps. I will leave the reading of this code | |
// up to you, since it is pretty descriptive in and of itself. Only here I will | |
// note possible speed up points | |
// In the Theta step, the second loop has you calculating 5 t values and successively | |
// XORing them with different values in st. It would probably be faster to calculate | |
// the 5 t values all at once (in an unrolled sort of way) and then successively | |
// XOR all the values in st all at once. (play with some XORing to make sure) | |
// In the Rho Pi step, I think having an extra variable is more readable than | |
// bc[0] = st[j] | |
// st[j] = blah(t, ajksf) | |
// t = bc[0] | |
// In the Chi step you can definitely shave off some loops, just need to experiment | |
// with how in a scratch C program. | |
// In the Iota step you are lastly taking a constant and XORing it with st[0] | |
// Congrats! Now you can implement a keccak hash in JavaScript (although it seems | |
// to differ a bit from the reference keccak hash) | |
// New place to see a good Keccak reference (all optimizations are there): | |
// https://github.com/lucasjones/cpuminer-multi/tree/master/crypto | |
// (newer miner with ops) https://github.com/fireice-uk/xmr-stak-cpu/tree/master/crypto | |
// function keccak(input, inlen, md, mdlen) { //md, mdlen here not necessary. mdlen==200 | |
function print64bit(hi, lo) { | |
hi = hi.toString(2); | |
lo = lo.toString(2); | |
if (hi.length < 32) { | |
var padding = 32 - hi.length; | |
for (var i = 0; i < padding; i++) { | |
hi = "0" + hi; | |
} | |
} | |
if (lo.length < 32) { | |
var padding = 32 - lo.length; | |
for (var i = 0; i < padding; i++) { | |
lo = "0" + lo; | |
} | |
} | |
console.log(hi + " " + lo); | |
} | |
module.exports = { | |
keccak: keccak | |
} | |
function keccak(input, inlen) { | |
const KECCAK_ROUNDS = 24; | |
const HASH_DATA_AREA = 136; | |
var state_buffer = new ArrayBuffer(200); | |
var st = new Uint32Array(state_buffer); | |
// original code runs on 64-bit data, since JS can only handle 32 | |
// we divide by 4 (bytes) instead of 8 (bytes) as in original code | |
var rsizw = HASH_DATA_AREA/4; | |
// for loop which gets skipped on small inputs!!! | |
var end = 0; | |
console.log(inlen); | |
var special_epochs = Math.floor(inlen / HASH_DATA_AREA); | |
if (special_epochs > 0) { | |
inlen = inlen % HASH_DATA_AREA; | |
for (var j = 0; j < special_epochs; ++j) { | |
var start = j*HASH_DATA_AREA; | |
end = start+HASH_DATA_AREA; | |
var temp = new Uint8Array(144); | |
temp.set(input.subarray(start, start+HASH_DATA_AREA)); | |
var ttemp = new Uint32Array(temp.buffer); | |
for (i=0; i<rsizw; ++i) { | |
st[i] ^= ttemp[i]; | |
} | |
keccakf(st, KECCAK_ROUNDS); | |
} | |
} | |
var temp = new Uint8Array(144); | |
temp.set(input.subarray(end,)); | |
temp[inlen++] = 1; | |
temp[HASH_DATA_AREA-1] |= 0x80; | |
var ttemp = new Uint32Array(temp.buffer); | |
// var ttemp = new Uint32Array(temp.buffer); | |
for (i=0; i<rsizw; ++i) { | |
if (i%2 == 1) { | |
print64bit(ttemp[i], ttemp[i-1]); | |
} | |
st[i] ^= ttemp[i]; | |
} | |
keccakf(st, KECCAK_ROUNDS); | |
return state_buffer; | |
} | |
function keccakf(st, rounds) { | |
// const keccakf_piln = [10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, | |
// 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 ]; | |
// const keccakf_rotc = [1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, | |
// 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44]; | |
const keccakf_rndc = [0x00000000, 0x00000001, 0x00000000, 0x00008082, 0x80000000, 0x0000808a, | |
0x80000000, 0x80008000, 0x00000000, 0x0000808b, 0x00000000, 0x80000001, | |
0x80000000, 0x80008081, 0x80000000, 0x00008009, 0x00000000, 0x0000008a, | |
0x00000000, 0x00000088, 0x00000000, 0x80008009, 0x00000000, 0x8000000a, | |
0x00000000, 0x8000808b, 0x80000000, 0x0000008b, 0x80000000, 0x00008089, | |
0x80000000, 0x00008003, 0x80000000, 0x00008002, 0x80000000, 0x00000080, | |
0x00000000, 0x0000800a, 0x80000000, 0x8000000a, 0x80000000, 0x80008081, | |
0x80000000, 0x00008080, 0x00000000, 0x80000001, 0x80000000, 0x80008008]; | |
// remember that st is a 32 bit array, NOT 64 like in the orig code | |
var t = new Uint32Array(2); | |
var bc = new Uint32Array(10); | |
for (var round=0; round<rounds; ++round) { | |
// Theta | |
bc[0] = st[0] ^ st[10] ^ st[20] ^ st[30] ^ st[40]; | |
bc[1] = st[1] ^ st[11] ^ st[21] ^ st[31] ^ st[41]; | |
bc[2] = st[2] ^ st[12] ^ st[22] ^ st[32] ^ st[42]; | |
bc[3] = st[3] ^ st[13] ^ st[23] ^ st[33] ^ st[43]; | |
bc[4] = st[4] ^ st[14] ^ st[24] ^ st[34] ^ st[44]; | |
bc[5] = st[5] ^ st[15] ^ st[25] ^ st[35] ^ st[45]; | |
bc[6] = st[6] ^ st[16] ^ st[26] ^ st[36] ^ st[46]; | |
bc[7] = st[7] ^ st[17] ^ st[27] ^ st[37] ^ st[47]; | |
bc[8] = st[8] ^ st[18] ^ st[28] ^ st[38] ^ st[48]; | |
bc[9] = st[9] ^ st[19] ^ st[29] ^ st[39] ^ st[49]; | |
for (var i=0; i<10; i+=2) { // must unroll loop as modulo 10 is expensive on GPU | |
var hi = bc[(i+3) % 10]; | |
var lo = bc[(i+2) % 10]; | |
t[0] = bc[(i+8) % 10] ^ rotl64_lo(hi, lo, 1); | |
t[1] = bc[(i+9) % 10] ^ rotl64_hi(hi, lo, 1); | |
st[i] ^= t[0]; | |
st[i+1] ^= t[1]; | |
st[i+10] ^= t[0]; | |
st[i+11] ^= t[1]; | |
st[i+20] ^= t[0]; | |
st[i+21] ^= t[1]; | |
st[i+30] ^= t[0]; | |
st[i+31] ^= t[1]; | |
st[i+40] ^= t[0]; | |
st[i+41] ^= t[1]; | |
} | |
// Rho Pi | |
t[0] = st[2]; | |
t[1] = st[3]; | |
st[ 2] = rotl64_lo(st[13], st[12], 44); | |
st[ 3] = rotl64_hi(st[13], st[12], 44); | |
st[12] = rotl64_lo(st[19], st[18], 20); | |
st[13] = rotl64_hi(st[19], st[18], 20); | |
st[18] = rotl64_lo(st[45], st[44], 61); | |
st[19] = rotl64_hi(st[45], st[44], 61); | |
st[44] = rotl64_lo(st[29], st[28], 39); | |
st[45] = rotl64_hi(st[29], st[28], 39); | |
st[28] = rotl64_lo(st[41], st[40], 18); | |
st[29] = rotl64_hi(st[41], st[40], 18); | |
st[40] = rotl64_lo(st[ 5], st[ 4], 62); | |
st[41] = rotl64_hi(st[ 5], st[ 4], 62); | |
st[ 4] = rotl64_lo(st[25], st[24], 43); | |
st[ 5] = rotl64_hi(st[25], st[24], 43); | |
st[24] = rotl64_lo(st[27], st[26], 25); | |
st[25] = rotl64_hi(st[27], st[26], 25); | |
st[26] = rotl64_lo(st[39], st[38], 8); | |
st[27] = rotl64_hi(st[39], st[38], 8); | |
st[38] = rotl64_lo(st[47], st[46], 56); | |
st[39] = rotl64_hi(st[47], st[46], 56); | |
st[46] = rotl64_lo(st[31], st[30], 41); | |
st[47] = rotl64_hi(st[31], st[30], 41); | |
st[30] = rotl64_lo(st[ 9], st[ 8], 27); | |
st[31] = rotl64_hi(st[ 9], st[ 8], 27); | |
st[ 8] = rotl64_lo(st[49], st[48], 14); | |
st[ 9] = rotl64_hi(st[49], st[48], 14); | |
st[48] = rotl64_lo(st[43], st[42], 2); | |
st[49] = rotl64_hi(st[43], st[42], 2); | |
st[42] = rotl64_lo(st[17], st[16], 55); | |
st[43] = rotl64_hi(st[17], st[16], 55); | |
st[16] = rotl64_lo(st[33], st[32], 45); | |
st[17] = rotl64_hi(st[33], st[32], 45); | |
st[32] = rotl64_lo(st[11], st[10], 36); | |
st[33] = rotl64_hi(st[11], st[10], 36); | |
st[10] = rotl64_lo(st[ 7], st[ 6], 28); | |
st[11] = rotl64_hi(st[ 7], st[ 6], 28); | |
st[ 6] = rotl64_lo(st[37], st[36], 21); | |
st[ 7] = rotl64_hi(st[37], st[36], 21); | |
st[36] = rotl64_lo(st[35], st[34], 15); | |
st[37] = rotl64_hi(st[35], st[34], 15); | |
st[34] = rotl64_lo(st[23], st[22], 10); | |
st[35] = rotl64_hi(st[23], st[22], 10); | |
st[22] = rotl64_lo(st[15], st[14], 6); | |
st[23] = rotl64_hi(st[15], st[14], 6); | |
st[14] = rotl64_lo(st[21], st[20], 3); | |
st[15] = rotl64_hi(st[21], st[20], 3); | |
st[20] = rotl64_lo(t[1], t[0], 1); | |
st[21] = rotl64_hi(t[1], t[0], 1); | |
// Chi | |
// Unrolling of the following loop (last iteration sets bc[0] thru bc[9]) | |
// for (i=0; i<40; i+=10) { | |
// bc[0] = st[i]; | |
// bc[1] = st[i+1]; | |
// bc[2] = st[i+2]; | |
// bc[3] = st[i+3]; | |
// st[i ] ^= (~st[i+2]) & st[i+4]; | |
// st[i+1] ^= (~st[i+3]) & st[i+5]; | |
// st[i+2] ^= (~st[i+4]) & st[i+6]; | |
// st[i+3] ^= (~st[i+5]) & st[i+7]; | |
// st[i+4] ^= (~st[i+6]) & st[i+8]; | |
// st[i+5] ^= (~st[i+7]) & st[i+9]; | |
// st[i+6] ^= (~st[i+8]) & bc[0]; | |
// st[i+7] ^= (~st[i+9]) & bc[1]; | |
// st[i+8] ^= (~bc[0]) & bc[2]; | |
// st[i+9] ^= (~bc[1]) & bc[3]; | |
// } | |
i = 0; | |
bc[0] = st[i]; | |
bc[1] = st[i+1]; | |
bc[2] = st[i+2]; | |
bc[3] = st[i+3]; | |
st[i ] ^= (~st[i+2]) & st[i+4]; | |
st[i+1] ^= (~st[i+3]) & st[i+5]; | |
st[i+2] ^= (~st[i+4]) & st[i+6]; | |
st[i+3] ^= (~st[i+5]) & st[i+7]; | |
st[i+4] ^= (~st[i+6]) & st[i+8]; | |
st[i+5] ^= (~st[i+7]) & st[i+9]; | |
st[i+6] ^= (~st[i+8]) & bc[0]; | |
st[i+7] ^= (~st[i+9]) & bc[1]; | |
st[i+8] ^= (~bc[0]) & bc[2]; | |
st[i+9] ^= (~bc[1]) & bc[3]; | |
i = 10; | |
bc[0] = st[i]; | |
bc[1] = st[i+1]; | |
bc[2] = st[i+2]; | |
bc[3] = st[i+3]; | |
st[i ] ^= (~st[i+2]) & st[i+4]; | |
st[i+1] ^= (~st[i+3]) & st[i+5]; | |
st[i+2] ^= (~st[i+4]) & st[i+6]; | |
st[i+3] ^= (~st[i+5]) & st[i+7]; | |
st[i+4] ^= (~st[i+6]) & st[i+8]; | |
st[i+5] ^= (~st[i+7]) & st[i+9]; | |
st[i+6] ^= (~st[i+8]) & bc[0]; | |
st[i+7] ^= (~st[i+9]) & bc[1]; | |
st[i+8] ^= (~bc[0]) & bc[2]; | |
st[i+9] ^= (~bc[1]) & bc[3]; | |
i = 20; | |
bc[0] = st[i]; | |
bc[1] = st[i+1]; | |
bc[2] = st[i+2]; | |
bc[3] = st[i+3]; | |
st[i ] ^= (~st[i+2]) & st[i+4]; | |
st[i+1] ^= (~st[i+3]) & st[i+5]; | |
st[i+2] ^= (~st[i+4]) & st[i+6]; | |
st[i+3] ^= (~st[i+5]) & st[i+7]; | |
st[i+4] ^= (~st[i+6]) & st[i+8]; | |
st[i+5] ^= (~st[i+7]) & st[i+9]; | |
st[i+6] ^= (~st[i+8]) & bc[0]; | |
st[i+7] ^= (~st[i+9]) & bc[1]; | |
st[i+8] ^= (~bc[0]) & bc[2]; | |
st[i+9] ^= (~bc[1]) & bc[3]; | |
i = 30; | |
bc[0] = st[i]; | |
bc[1] = st[i+1]; | |
bc[2] = st[i+2]; | |
bc[3] = st[i+3]; | |
st[i ] ^= (~st[i+2]) & st[i+4]; | |
st[i+1] ^= (~st[i+3]) & st[i+5]; | |
st[i+2] ^= (~st[i+4]) & st[i+6]; | |
st[i+3] ^= (~st[i+5]) & st[i+7]; | |
st[i+4] ^= (~st[i+6]) & st[i+8]; | |
st[i+5] ^= (~st[i+7]) & st[i+9]; | |
st[i+6] ^= (~st[i+8]) & bc[0]; | |
st[i+7] ^= (~st[i+9]) & bc[1]; | |
st[i+8] ^= (~bc[0]) & bc[2]; | |
st[i+9] ^= (~bc[1]) & bc[3]; | |
i = 40; | |
bc[0] = st[i]; | |
bc[1] = st[i+1]; | |
bc[2] = st[i+2]; | |
bc[3] = st[i+3]; | |
bc[4] = st[i+4]; | |
bc[5] = st[i+5]; | |
bc[6] = st[i+6]; | |
bc[7] = st[i+7]; | |
bc[8] = st[i+8]; | |
bc[9] = st[i+9]; | |
st[i ] ^= (~st[i+2]) & st[i+4]; | |
st[i+1] ^= (~st[i+3]) & st[i+5]; | |
st[i+2] ^= (~st[i+4]) & st[i+6]; | |
st[i+3] ^= (~st[i+5]) & st[i+7]; | |
st[i+4] ^= (~st[i+6]) & st[i+8]; | |
st[i+5] ^= (~st[i+7]) & st[i+9]; | |
st[i+6] ^= (~st[i+8]) & bc[0]; | |
st[i+7] ^= (~st[i+9]) & bc[1]; | |
st[i+8] ^= (~bc[0]) & bc[2]; | |
st[i+9] ^= (~bc[1]) & bc[3]; | |
// Iota | |
var round_index = round*2; | |
st[0] ^= keccakf_rndc[round_index+1]; | |
st[1] ^= keccakf_rndc[round_index]; | |
} | |
return st; | |
} | |
// function to rotate two 32 bit ints (as if they are one 64 bit int) | |
// n is never 0, so we don't check for that case | |
// Because of Javascript treating all bitwise operations as done on | |
// *signed* 32-bit integers, we need to sprinkle in many ">>> 0" ops | |
// throughout the code and we also have to always shift by ">>>" not ">>" | |
// see here: https://stackoverflow.com/questions/6798111/bitwise-operations-on-32-bit-unsigned-ints | |
function rotl64_hi(hi, lo, offset) { | |
if (offset & 0x20) { // if offset >= 32 we switch lo and hi | |
lo ^= hi; | |
hi ^= lo; | |
lo ^= hi; | |
} | |
return (((hi << offset) >>> 0) | (lo >>> (32 - offset))) >>> 0; | |
} | |
function rotl64_lo(hi, lo, offset) { | |
if (offset & 0x20) { // if offset >= 32 we switch lo and hi | |
lo ^= hi; | |
hi ^= lo; | |
lo ^= hi; | |
} | |
return (((lo << offset) >>> 0) | (hi >>> (32 - offset))) >>> 0; | |
} | |
// i think some of the convenience functions from | |
// https://gist.github.com/devi/27a8bfcf77eb765540d2 | |
// would be a good place to look to make this easier to implement |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment