Created
August 5, 2011 17:22
-
-
Save gpakosz/1128030 to your computer and use it in GitHub Desktop.
ARM NEON integer 16x8 dot product
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------------------
# int32_t dotProduct_16x8_neon(int16_t const* __restrict u, int8_t const* __restrict v, int32_t size)
#
# Signed dot product of a 16-bit vector u with an 8-bit vector v:
#     sum(u[i] * v[i])
# Elements are consumed in chunks of 16, so only the first (size / 16) * 16
# elements contribute; any trailing size % 16 elements are ignored.
# When size < 16 (including zero or negative) no memory is read and 0 is
# returned.
# Products are accumulated in 32-bit lanes and the low 32 bits of the total
# are returned, so the result wraps modulo 2^32.
#
# target: 32-bit ARM with NEON; the export directives below are Mach-O ones.
#
# calling conventions:
#---------------------
# r0 = u        (on return: the dot product)
# r1 = v
# r2 = size     (reused as the chunk counter)
#
# used neon registers:
# --------------------
# q0            = accumulator (4 x int32)
# q1 (d2,  d3)  = u[0..7]
# q2 (d4,  d5)  = v[0..7], loaded into d4 then sign-extended to 16 bits
# q3 (d6,  d7)  = v[8..15], loaded into d6 then sign-extended to 16 bits
# q8 (d16, d17) = u[8..15]
#
# clobbers: r2, q0-q3, q8, condition flags.
# Only caller-saved NEON registers are used: d8-d15 (q4-q7) must be preserved
# across calls, which is why the second u load goes to q8 and not q4.
    .globl _dotProduct_16x8_neon
    .private_extern _dotProduct_16x8_neon
    .no_dead_strip _dotProduct_16x8_neon
_dotProduct_16x8_neon:
    pld         [r0]
    pld         [r1]
    vmov.i32    q0, #0                  @ accumulator = 0 (NEON ops leave the flags alone)
    asr         r2, r2, #4              @ r2 = number of full 16-element chunks
    subs        r2, r2, #1              @ r2 = chunks - 1, negative when there is no full chunk
    bmi         .done16x8               @ nothing to process: fall through to return 0
.loop16x8:
    vld1.8      {d4}, [r1]!             @ d4 = v[0..7]
    vld1.16     {d2-d3}, [r0]!          @ q1 = u[0..7]
    vld1.8      {d6}, [r1]!             @ d6 = v[8..15]
    vld1.16     {d16-d17}, [r0]!        @ q8 = u[8..15]
    pld         [r0, #128]
    pld         [r1, #128]
    vmovl.s8    q2, d4                  @ widen v[0..7]  to int16 in d4, d5
    vmovl.s8    q3, d6                  @ widen v[8..15] to int16 in d6, d7
    vmlal.s16   q0, d2, d4              @ acc += u[0..3]   * v[0..3]
    vmlal.s16   q0, d3, d5              @ acc += u[4..7]   * v[4..7]
    vmlal.s16   q0, d6, d16             @ acc += v[8..11]  * u[8..11]
    vmlal.s16   q0, d7, d17             @ acc += v[12..15] * u[12..15]
    subs        r2, r2, #1
    bpl         .loop16x8               @ loop while chunks remain
.done16x8:
    vpaddl.s32  q0, q0                  @ 4 x int32 -> 2 x int64 pairwise sums
    vadd.i64    d0, d0, d1              @ d0 = total of all four lanes
    vmov.32     r0, d0[0]               @ return the low 32 bits
    bx          lr
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment