Created
June 30, 2012 13:10
-
-
Save syohex/3023686 to your computer and use it in GitHub Desktop.
Sample code for generating ARM NEON instructions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
arm-linux-gnueabi-gcc-4.6 -O2 -march=armv7-a -mtune=cortex-a9 -ftree-vectorize -mhard-float -mfloat-abi=softfp -mfpu=neon -ffast-math -mvectorize-with-neon-quad -S neon_test.c
*/
/* Number of elements NeonTest() reads from a/b and writes to z. */
enum { NEON_TEST_LEN = 200 };

/*
 * Element-wise 16-bit multiply: z[i] = a[i] * b[i] for i in [0, NEON_TEST_LEN).
 *
 * a, b  input arrays, each at least NEON_TEST_LEN elements long
 * z     output array, at least NEON_TEST_LEN elements long
 *
 * All three pointers are __restrict-qualified, so the arrays must not
 * overlap; that promise is what allows the compiler to vectorize the loop.
 * Each product is computed in int (after the usual integer promotions) and
 * then narrowed back to short int, so only the low 16 bits are kept.
 */
void NeonTest(short int * __restrict a, short int * __restrict b, short int * __restrict z)
{
    int i;

    for (i = 0; i < NEON_TEST_LEN; i++) {
        /* Explicit cast documents the intentional int -> short narrowing. */
        z[i] = (short int)(a[i] * b[i]);
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@-----------------------------------------------------------------------
@ void NeonTest(short *restrict a, short *restrict b, short *restrict z)
@
@ Element-wise 16-bit multiply: z[i] = a[i] * b[i], i in [0, ELEM_COUNT).
@ Compiler output (see .ident below); Thumb-2 + NEON, GAS unified syntax.
@
@ In:    r0 = a, r1 = b, r2 = z
@ Out:   none
@ Clobb: r0-r3, ip, q8-q9 (d16-d19), flags
@ Saved: r4-r11 are pushed on entry and popped on exit; lr is not touched
@ Stack: 32 bytes of saved registers + an 8-byte frame (only [sp, #4] used)
@
@ Structure:
@   .L3  scalar peel loop - 0..7 elements, count taken from bits 1..3 of a
@   .L5  NEON loop        - 8 halfwords (one q register) per iteration
@   .L7  scalar tail loop - whatever the NEON loop did not cover
@
@ NOTE(review): the peel count only brings `a` to a 16-byte boundary if `a`
@ is at least halfword-aligned - confirm against callers.
@-----------------------------------------------------------------------
        .syntax unified
        .arch armv7-a
        .eabi_attribute 27, 3           @ Tag_ABI_HardFP_use
        .fpu neon
        .eabi_attribute 23, 1           @ Tag_ABI_FP_number_model
        .eabi_attribute 24, 1           @ Tag_ABI_align_needed
        .eabi_attribute 25, 1           @ Tag_ABI_align_preserved
        .eabi_attribute 26, 2           @ Tag_ABI_enum_size
        .eabi_attribute 30, 2           @ Tag_ABI_optimization_goals
        .eabi_attribute 34, 1           @ Tag_CPU_unaligned_access
        .eabi_attribute 18, 4           @ Tag_ABI_PCS_wchar_t
        .thumb
        .file   "neon_test2.c"

        .equ    ELEM_COUNT, 200         @ loop trip count of the C source

        .text
        .align  2
        .global NeonTest
        .thumb
        .thumb_func
        .type   NeonTest, %function
NeonTest:
        @ args = 0, pretend = 0, frame = 8
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        ubfx    r3, r0, #1, #3          @ r3 = (a >> 1) & 7
        push    {r4, r5, r6, r7, r8, r9, sl, fp}
        negs    r3, r3
        sub     sp, sp, #8              @ 8-byte frame; [sp, #4] holds the post-peel count
        ands    r3, r3, #7              @ r3 = peel count = (-((a >> 1) & 7)) & 7
        beq     .L8                     @ peel count is 0: skip the peel loop
        movs    r6, #0                  @ r6 = byte offset into a/b/z
        mov     r4, r6                  @ r4 = elements done so far (0)

.L3:                                    @ ---- scalar peel loop ----
        ldrh    r7, [r1, r6]            @ r7 = b[i]
        adds    r4, r4, #1              @ one more element done
        ldrh    ip, [r0, r6]            @ ip = a[i]
        rsb     r5, r4, #ELEM_COUNT     @ r5 = elements still to do
        mul     r7, ip, r7
        strh    r7, [r2, r6]            @ movhi ; z[i] = low 16 bits of the product
        adds    r6, r6, #2              @ next halfword
        cmp     r3, r4
        bhi     .L3                     @ unsigned: loop while done < peel count

.L2:                                    @ here: r3 = peel count, r4 = elements done, r5 = elements left
        rsb     r6, r3, #ELEM_COUNT     @ r6 = elements left after the peel
        lsr     r8, r6, #3              @ r8 = number of 8-element NEON iterations
        str     r6, [sp, #4]            @ spill: reloaded after .L5 for the "all done?" check
        lsl     fp, r8, #3              @ fp = elements the NEON loop covers
        cmp     r8, #0
        beq     .L4                     @ not even one full vector: scalar tail only
        add     r3, r0, r3, lsl #1      @ r3 = &a[peel count], the running a pointer
        movs    r6, #0                  @ r6 = NEON iteration counter
        rsb     sl, r0, r1              @ sl = b - a (bytes); b pointer = r3 + sl
        rsb     r9, r0, r2              @ r9 = z - a (bytes); z pointer = r3 + r9
        mov     ip, r4                  @ keep elements-done; r4 is reused as z pointer

.L5:                                    @ ---- NEON loop: 8 halfwords per pass ----
        add     r7, r3, sl              @ r7 = current b pointer
        vldmia  r3, {d16-d17}           @ q8 = 8 halfwords from a
        adds    r6, r6, #1
        add     r4, r3, r9              @ r4 = current z pointer
        vld1.16 {q9}, [r7]              @ q9 = 8 halfwords from b (no alignment qualifier)
        adds    r3, r3, #16             @ advance a pointer by 16 bytes
        vmul.i16 q8, q9, q8             @ <= NEON instruction: eight 16-bit products at once
        cmp     r6, r8
        vst1.16 {q8}, [r4]              @ store 8 results to z; sits between cmp and bcc
        bcc     .L5                     @ unsigned: loop while iteration < r8

        ldr     r3, [sp, #4]            @ r3 = elements left after the peel
        add     r4, ip, fp              @ r4 = elements done (peel + vectorized)
        rsb     r5, fp, r5              @ r5 = r5 - fp = elements left for the tail
        cmp     r3, fp
        beq     .L1                     @ NEON loop consumed everything: done

.L4:                                    @ ---- scalar tail setup ----
        lsls    r4, r4, #1              @ elements done -> byte offset
        movs    r3, #0                  @ r3 = byte offset within the tail
        adds    r2, r2, r4              @ z += done
        adds    r1, r1, r4              @ b += done
        adds    r0, r0, r4              @ a += done

.L7:                                    @ ---- scalar tail loop: r5 elements ----
        ldrh    r4, [r1, r3]            @ r4 = b[i]
        ldrh    r6, [r0, r3]            @ r6 = a[i]
        mul     r4, r6, r4
        strh    r4, [r2, r3]            @ movhi ; z[i] = low 16 bits of the product
        adds    r3, r3, #2
        subs    r5, r5, #1
        bne     .L7

.L1:                                    @ ---- epilogue ----
        add     sp, sp, #8              @ drop the 8-byte frame
        pop     {r4, r5, r6, r7, r8, r9, sl, fp}
        bx      lr

.L8:                                    @ peel count == 0 (r3 is 0 here)
        movs    r5, #ELEM_COUNT         @ every element is still to do
        mov     r4, r3                  @ elements done = 0
        b       .L2
        .size   NeonTest, .-NeonTest
        .ident  "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
        .section        .note.GNU-stack,"",%progbits
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment