Last active
December 2, 2024 10:55
-
-
Save geofflangdale/0012da44731624802606376bc680b5bd to your computer and use it in GitHub Desktop.
ARM NEON substitute for PMOVMSKB: turns four 128-bit predicate results into a single 64-bit value
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Collapse four 16-lane byte predicates into a single 64-bit mask.
 *
 * This is an AArch64 NEON stand-in for running x86 PMOVMSKB over 64 bytes.
 * Lane i of p0 controls result bit i, lane i of p1 controls bit 16 + i,
 * lane i of p2 controls bit 32 + i and lane i of p3 controls bit 48 + i.
 *
 * Only bit (i % 8) of lane i is inspected. Lanes that are all-ones or
 * all-zeros (e.g. the output of a comparison such as vceqq_u8) therefore map
 * to a set or clear result bit respectively.
 *
 * @param p0 predicate lanes for result bits 0..15
 * @param p1 predicate lanes for result bits 16..31
 * @param p2 predicate lanes for result bits 32..47
 * @param p3 predicate lanes for result bits 48..63
 * @return the 64-bit mask described above
 */
uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) {
  /*
   * One distinct bit per lane within each 8-lane half. The table is loaded
   * with vld1q_u8 instead of brace-initialising the vector, because brace
   * initialisation of NEON vector types is a GCC/Clang extension.
   */
  static const uint8_t lane_bit[16] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
  const uint8x16_t bitmask = vld1q_u8(lane_bit);

  /* Keep only the bit that identifies each lane's position in its half. */
  uint8x16_t t0 = vandq_u8(p0, bitmask);
  uint8x16_t t1 = vandq_u8(p1, bitmask);
  uint8x16_t t2 = vandq_u8(p2, bitmask);
  uint8x16_t t3 = vandq_u8(p3, bitmask);

  /*
   * Three rounds of pairwise addition fold every group of 8 lanes into one
   * byte. The bits within a group are disjoint, so the additions never carry
   * and act as a bitwise OR.
   */
  uint8x16_t sum0 = vpaddq_u8(t0, t1);  /* 8 partial bytes of p0 | 8 of p1 */
  uint8x16_t sum1 = vpaddq_u8(t2, t3);  /* 8 partial bytes of p2 | 8 of p3 */
  sum0 = vpaddq_u8(sum0, sum1);         /* 4 partial bytes for each of p0..p3 */
  sum0 = vpaddq_u8(sum0, sum0);         /* 2 final bytes for each of p0..p3, duplicated */

  /* The low 64 bits now hold the complete mask; the high half is a copy. */
  return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment