Skip to content

Instantly share code, notes, and snippets.

@rygorous
Last active February 24, 2026 03:16
Show Gist options
  • Select an option

  • Save rygorous/563e09067e203ac329d26419eaffd996 to your computer and use it in GitHub Desktop.

Select an option

Save rygorous/563e09067e203ac329d26419eaffd996 to your computer and use it in GitHub Desktop.
Generate all 3-input ternary logic ops on bits via ARM instructions. (SHA3 ext ops optional)
0x00 0 cost=10
0x01 BCAX(A, 0xf5, 0x0c) cost=30
0x02 BIC(0x0a, B) cost=20
0x03 EOR(A, 0xf3) cost=20
0x04 BIC(B, 0xfa) cost=20
0x05 EOR(A, 0xf5) cost=20
0x06 BCAX(0x0a, B, A) cost=20
0x07 EOR(A, 0xf7) cost=30
0x08 BCAX(B, B, 0x0a) cost=20
0x09 EOR3(A, 0x0a, 0xf3) cost=30
0x0a BIC(C, A) cost=10
0x0b BIC(0xbb, A) cost=20
0x0c BIC(B, A) cost=10
0x0d BIC(0xdd, A) cost=20
0x0e BIC(0xee, A) cost=20
0x0f NOT(A) cost=10
0x10 BIC(A, 0xee) cost=20
0x11 EOR(B, 0xdd) cost=20
0x12 BCAX(0x22, A, B) cost=20
0x13 BCAX(0x22, 0xf5, B) cost=30
0x14 BIC(0x3c, C) cost=20
0x15 BCAX(0x44, 0xf3, C) cost=30
0x16 EOR3(A, C, 0x4c) cost=30
0x17 EOR(0xaf, 0xb8) cost=31
0x18 BCAX(0x0a, 0x5a, B) cost=30
0x19 BCAX(B, 0xdd, 0x0a) cost=30
0x1a EOR3(A, C, 0x40) cost=30
0x1b NOT(0xe4) cost=21
0x1c BCAX(B, A, 0x22) cost=20
0x1d NOT(0xe2) cost=21
0x1e EOR3(B, A, 0x22) cost=20
0x1f EOR(A, 0xef) cost=30
0x20 BCAX(A, A, 0x22) cost=20
0x21 EOR3(A, 0x22, 0xf3) cost=30
0x22 BIC(C, B) cost=10
0x23 BIC(0xaf, B) cost=20
0x24 BCAX(0x0c, C, 0x96) cost=30
0x25 BCAX(A, 0xf5, 0x22) cost=30
0x26 BCAX(0x0c, C, 0xc0) cost=30
0x27 NOT(0xd8) cost=21
0x28 BCAX(0x0a, C, B) cost=20
0x29 BCAX(0x0a, 0xaf, B) cost=30
0x2a BSL(B, 0x0a, C) cost=21
0x2b BCAX(0xaf, B, 0x5a) cost=30
0x2c BSL(A, 0x22, B) cost=21
0x2d EOR(A, 0xdd) cost=20
0x2e BCAX(0x0c, C, B) cost=20
0x2f ORN(0x22, A) cost=20
0x30 BIC(A, B) cost=10
0x31 BIC(0xf5, B) cost=20
0x32 BIC(0xfa, B) cost=20
0x33 NOT(B) cost=10
0x34 BCAX(A, B, 0x0a) cost=20
0x35 NOT(0xca) cost=21
0x36 EOR3(B, A, 0x0a) cost=20
0x37 EOR(A, 0xc7) cost=30
0x38 BSL(B, 0x0a, A) cost=21
0x39 BCAX(0x33, C, A) cost=20
0x3a BCAX(0x0a, A, B) cost=20
0x3b ORN(0x0a, B) cost=20
0x3c EOR(B, A) cost=10
0x3d BCAX(A, 0xdd, 0x30) cost=30
0x3e BCAX(A, 0xee, 0x30) cost=30
0x3f EOR(A, 0xcf) cost=20
0x40 BIC(B, 0xaf) cost=20
0x41 EOR3(B, 0xaf, 0x22) cost=30
0x42 BCAX(0x0a, B, 0x96) cost=30
0x43 BCAX(A, 0xbb, 0x0c) cost=30
0x44 BIC(B, C) cost=10
0x45 BIC(0xcf, C) cost=20
0x46 BCAX(B, 0x9a, 0x55) cost=30
0x47 NOT(0xb8) cost=21
0x48 BCAX(B, B, 0x5a) cost=20
0x49 BCAX(B, 0xdd, 0x5a) cost=30
0x4a BSL(A, 0x44, C) cost=21
0x4b EOR(A, 0xbb) cost=20
0x4c BIC(B, 0xa0) cost=20
0x4d BCAX(0xcf, C, 0x3c) cost=30
0x4e BCAX(0x0a, B, C) cost=20
0x4f ORN(0x44, A) cost=20
0x50 BIC(A, C) cost=10
0x51 BIC(0xf3, C) cost=20
0x52 BCAX(A, C, 0x0c) cost=20
0x53 NOT(0xac) cost=21
0x54 BIC(0xfc, C) cost=20
0x55 NOT(C) cost=10
0x56 EOR(B, 0x9a) cost=20
0x57 BCAX(0x55, C, 0xfc) cost=30
0x58 BCAX(B, 0x96, 0x0a) cost=30
0x59 BCAX(0x55, B, A) cost=20
0x5a EOR(C, A) cost=10
0x5b BCAX(A, 0xaf, 0x44) cost=30
0x5c BCAX(B, A, 0x66) cost=20
0x5d ORN(0x0c, C) cost=20
0x5e BCAX(B, 0x96, 0x44) cost=30
0x5f EOR(A, 0xaf) cost=20
0x60 BCAX(A, A, 0x66) cost=20
0x61 BCAX(A, 0xf5, 0x66) cost=30
0x62 BCAX(A, 0x96, 0x44) cost=30
0x63 EOR(B, 0xaf) cost=20
0x64 BCAX(A, 0x96, 0x0a) cost=30
0x65 BCAX(0x55, A, B) cost=20
0x66 EOR(C, B) cost=10
0x67 BCAX(B, 0xaf, 0x44) cost=30
0x68 BCAX(B, 0xee, 0x5a) cost=30
0x69 EOR3(B, A, 0x55) cost=20
0x6a EOR(B, 0xa6) cost=20
0x6b ORN(0x0a, 0x96) cost=30
0x6c BCAX(B, A, 0x55) cost=20
0x6d BCAX(B, 0xbb, 0x5a) cost=30
0x6e BCAX(B, 0xa6, 0x55) cost=30
0x6f ORN(0x66, A) cost=20
0x70 BIC(A, 0x88) cost=20
0x71 BCAX(0xf3, C, 0x3c) cost=30
0x72 BCAX(A, C, 0x3c) cost=20
0x73 ORN(0x50, B) cost=20
0x74 BCAX(A, B, 0x5a) cost=20
0x75 ORN(0x30, C) cost=20
0x76 BCAX(B, 0xfa, 0x44) cost=30
0x77 EOR(B, 0xbb) cost=20
0x78 EOR3(B, 0x44, A) cost=20
0x79 BCAX(A, 0xaf, 0x66) cost=30
0x7a BCAX(A, C, 0x30) cost=20
0x7b ORN(0x5a, B) cost=20
0x7c BCAX(A, B, 0x50) cost=30
0x7d ORN(0x3c, C) cost=20
0x7e ORR(0x3c, 0x5a) cost=30
0x7f BCAX(A, 0xaf, 0x30) cost=30
0x80 BCAX(B, B, 0xa0) cost=20
0x81 BCAX(0x8b, C, A) cost=30
0x82 BCAX(0x88, C, A) cost=20
0x83 BCAX(A, 0xf3, 0x88) cost=30
0x84 BCAX(B, B, 0x96) cost=20
0x85 BCAX(B, 0xdd, 0x96) cost=30
0x86 EOR3(A, C, 0xdc) cost=30
0x87 EOR3(B, A, 0xbb) cost=20
0x88 AND(C, B) cost=10
0x89 BCAX(B, 0x55, 0x9a) cost=30
0x8a AND(0x9a, C) cost=20
0x8b BCAX(0xbb, A, B) cost=20
0x8c BCAX(B, A, 0xbb) cost=20
0x8d BCAX(0xaf, C, B) cost=20
0x8e BSL(0x3c, B, C) cost=21
0x8f ORN(0x88, A) cost=20
0x90 BCAX(A, A, 0x96) cost=20
0x91 BCAX(B, 0xdd, 0xa0) cost=30
0x92 EOR3(A, C, 0xc8) cost=30
0x93 EOR3(B, A, 0xaf) cost=20
0x94 BCAX(B, 0x5a, 0x22) cost=30
0x95 EOR3(A, C, 0xcf) cost=20
0x96 EOR3(C, A, B) cost=10
0x97 EOR3(A, C, 0xcd) cost=30
0x98 BSL(B, C, 0xd8) cost=22
0x99 EOR3(C, 1, B) cost=20
0x9a BCAX(C, A, B) cost=10
0x9b EOR3(A, C, 0xc1) cost=30
0x9c BCAX(B, A, C) cost=10
0x9d BCAX(B, 0x55, 0xa6) cost=30
0x9e BCAX(B, 0x5a, 0x88) cost=30
0x9f ORN(0x96, A) cost=20
0xa0 AND(C, A) cost=10
0xa1 BCAX(A, 0xf3, C) cost=20
0xa2 AND(0xa6, C) cost=20
0xa3 BCAX(0xaf, B, A) cost=20
0xa4 BCAX(A, 0xfc, C) cost=20
0xa5 EOR3(C, 1, A) cost=20
0xa6 BCAX(C, B, A) cost=10
0xa7 EOR3(A, C, 0xfd) cost=30
0xa8 AND(0xfc, C) cost=20
0xa9 EOR3(A, C, 0xf3) cost=20
0xaa C cost=0
0xab BSL(B, C, 0xaf) cost=21
0xac BSL(A, C, B) cost=11
0xad BSL(A, C, 0xdd) cost=21
0xae ORR(0x0c, C) cost=20
0xaf ORN(C, A) cost=10
0xb0 BCAX(A, A, 0xbb) cost=20
0xb1 BCAX(0xbb, C, A) cost=20
0xb2 BSL(0x3c, A, C) cost=21
0xb3 ORN(0xa0, B) cost=20
0xb4 BCAX(A, B, C) cost=10
0xb5 BCAX(A, 0xcf, C) cost=20
0xb6 EOR3(A, C, 0xec) cost=30
0xb7 ORN(0x96, B) cost=20
0xb8 BSL(B, C, A) cost=11
0xb9 BCAX(A, 0xdd, 0x96) cost=30
0xba ORR(0x30, C) cost=20
0xbb ORN(C, B) cost=10
0xbc BCAX(B, A, 0x88) cost=20
0xbd ORN(0x3c, 0x5a) cost=30
0xbe ORR(0x3c, C) cost=20
0xbf ORN(0x9c, B) cost=20
0xc0 AND(B, A) cost=10
0xc1 BCAX(B, 0xdd, A) cost=20
0xc2 BCAX(B, 0xee, A) cost=20
0xc3 EOR3(B, 1, A) cost=20
0xc4 BCAX(B, C, 0xa6) cost=20
0xc5 BCAX(0xcf, C, A) cost=20
0xc6 BCAX(B, C, A) cost=10
0xc7 BCAX(B, 0xbb, A) cost=20
0xc8 BCAX(B, B, 0xfa) cost=20
0xc9 EOR3(B, C, 0xaf) cost=20
0xca BSL(A, B, C) cost=11
0xcb BSL(A, B, 0xbb) cost=21
0xcc B cost=0
0xcd ORN(B, 0xfa) cost=20
0xce ORR(B, 0x0a) cost=20
0xcf ORN(B, A) cost=10
0xd0 BCAX(A, A, 0xdd) cost=20
0xd1 BCAX(0xdd, B, A) cost=20
0xd2 BCAX(A, C, B) cost=10
0xd3 BCAX(A, 0xaf, B) cost=20
0xd4 BSL(0x5a, A, B) cost=21
0xd5 ORN(0xc0, C) cost=20
0xd6 BCAX(B, 0x5a, 0x44) cost=30
0xd7 ORN(0x96, C) cost=20
0xd8 BSL(C, B, A) cost=11
0xd9 BCAX(A, 0xaf, 0x96) cost=30
0xda BCAX(A, C, 0xc0) cost=20
0xdb ORN(0x5a, 0x66) cost=30
0xdc BCAX(B, A, 0xee) cost=20
0xdd ORN(B, C) cost=10
0xde ORR(B, 0x5a) cost=20
0xdf ORN(B, 0xa0) cost=20
0xe0 BCAX(A, A, 0xee) cost=20
0xe1 EOR3(B, 0xdd, A) cost=20
0xe2 BSL(B, A, C) cost=11
0xe3 BSL(B, A, 0xaf) cost=21
0xe4 BSL(C, A, B) cost=11
0xe5 BCAX(B, 0xaf, 0x96) cost=30
0xe6 BSL(B, 0xe4, C) cost=22
0xe7 ORN(0x66, 0x5a) cost=30
0xe8 BSL(0x3c, C, A) cost=21
0xe9 BCAX(B, 0xaf, 0x9a) cost=30
0xea ORR(0xc0, C) cost=20
0xeb BCAX(0xaf, B, C) cost=20
0xec BCAX(B, C, 0x9a) cost=20
0xed ORN(B, 0x5a) cost=20
0xee ORR(C, B) cost=10
0xef ORR(B, 0xaf) cost=20
0xf0 A cost=0
0xf1 ORN(A, 0xee) cost=20
0xf2 BCAX(A, C, 0xfc) cost=20
0xf3 ORN(A, B) cost=10
0xf4 BCAX(A, B, 0xfa) cost=20
0xf5 ORN(A, C) cost=10
0xf6 ORR(A, 0x66) cost=20
0xf7 ORN(A, 0x88) cost=20
0xf8 BCAX(A, B, 0xc6) cost=20
0xf9 ORN(A, 0x66) cost=20
0xfa ORR(C, A) cost=10
0xfb ORN(A, 0x44) cost=20
0xfc ORR(B, A) cost=10
0xfd ORN(B, 0x0a) cost=20
0xfe ORR(B, 0xfa) cost=20
0xff 1 cost=10
max_cost=31 avg_cost=20.9
import heapq
if __name__ == '__main__':
ops = 256 * [None]
worklist = []
filled = []
def add_seed(tt, cost, desc):
assert ops[tt] is None
ops[tt] = (cost, desc, None, None, None)
heapq.heappush(worklist, (cost, tt))
filled.append(tt)
def add_tt(tt, desc, cost, A, B=None, C=None):
if ops[tt] is not None:
if ops[tt][0] <= cost:
return
# leave existing stale entry in worklist around, we'll discover it later
else:
filled.append(tt)
ops[tt] = (cost, desc, A, B, C)
heapq.heappush(worklist, (cost, tt))
# Add seed entries
add_seed(0b0000_0000, 10, '0') # count as single mov
add_seed(0b1111_1111, 10, '1') # count as single mov
add_seed(0b1111_0000, 0, 'A') # count as free
add_seed(0b1100_1100, 0, 'B') # count as free
add_seed(0b1010_1010, 0, 'C') # count as free
SHA3_OPS = True
# Build other functions inductively
while len(worklist) > 0:
a_cost, a_tt = heapq.heappop(worklist)
# skip over stale entries with old prio
if ops[a_tt][0] < a_cost:
continue
cost1 = a_cost + 10 # we're adding a new insn
initial_filled = len(filled)
# NOT
add_tt(a_tt ^ 0xff, 'NOT', cost1, a_tt)
# fill out all binary before any ternary so they get the slot when viable
for b_tt in filled[:initial_filled]:
cost2 = cost1 + ops[b_tt][0]
add_tt(a_tt | b_tt, 'ORR', cost2, a_tt, b_tt)
add_tt(a_tt & b_tt, 'AND', cost2, a_tt, b_tt)
add_tt(a_tt | (b_tt ^ 0xff), 'ORN', cost2, a_tt, b_tt)
add_tt(a_tt & ~b_tt, 'BIC', cost2, a_tt, b_tt)
add_tt(a_tt ^ b_tt, 'EOR', cost2, a_tt, b_tt)
# ternary ops
for c_tt in filled[:initial_filled]:
cost3 = cost2 + ops[c_tt][0]
add_tt((a_tt & b_tt) | (~a_tt & c_tt), 'BSL', cost3 + 1, a_tt, b_tt, c_tt) # +1 cost penalty because BSLs have reg alloc limitations
if SHA3_OPS:
add_tt(a_tt ^ b_tt ^ c_tt, 'EOR3', cost3, a_tt, b_tt, c_tt)
add_tt(a_tt ^ (b_tt & ~c_tt), 'BCAX', cost3, a_tt, b_tt, c_tt)
max_cost = 0
sum_costs = 0
def format_op(A):
if A is None:
return ''
if ops[A] is not None:
desc = ops[A][1]
if len(desc) == 1:
return desc
return f'0x{A:02x}'
for i, entry in enumerate(ops):
if entry is None:
continue
cost, desc, A, B, C = entry
Aop = format_op(A)
Bop = format_op(B)
Cop = format_op(C)
if C is not None:
opdesc = f'{desc}({Aop}, {Bop}, {Cop})'
elif B is not None:
opdesc = f'{desc}({Aop}, {Bop})'
elif A is not None:
opdesc = f'{desc}({Aop})'
else:
opdesc = desc
print(f'0x{i:02x} {opdesc} cost={cost}')
sum_costs += cost
max_cost = max(max_cost, cost)
print(f'max_cost={max_cost} avg_cost={sum_costs/256:.1f}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment