Last active
December 18, 2015 23:28
-
-
Save n-west/5861300 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static inline void volk_arm_32f_x2_add_32f_a_inlineneon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) { | |
unsigned int number; | |
const unsigned int quarterPoints = num_points / 4; | |
float* cPtr = cVector; | |
const float* aPtr = aVector; | |
const float* bPtr= bVector; | |
for(number=0; number < quarterPoints; number++){ | |
__asm__ volatile("vld1.32 {d0-d1}, [%[a]]!\n\t" | |
"vld1.32 {d2-d3}, [%[b]]!\n\t" | |
"vadd.f32 q1, q0, q1\n\t" | |
"vst1.32 {d2-d3}, [%[output]]!\n\t" | |
: [output] "=r"(cPtr) | |
: [a] "r"(aPtr), [b] "r"(bPtr) | |
: "memory", "d0", "d1", "d2", "d3", "d4", "d5" | |
); | |
} | |
number = quarterPoints * 4; // should be = num_points | |
for(;number < num_points; number++){ | |
*cPtr++ = (*aPtr++) + (*bPtr++); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@ -----------------------------------------------------------------------
@ volk_arm_32f_x2_add_32f_a_inlineneon   (GCC output, ARM, GNU as syntax)
@ C equivalent:
@   void f(float* cVector, const float* aVector,
@          const float* bVector, unsigned int num_points)
@ In:     r0 = cVector (running output pointer), r1 = aVector,
@         r2 = bVector, r3 = num_points
@ Saved:  r4-r9, sl are pushed on entry and popped at .L30
@ Uses:   ip, d0-d3 (inline NEON), q8-q9, s14-s15, condition flags
@ -----------------------------------------------------------------------
volk_arm_32f_x2_add_32f_a_inlineneon:
.LFB1897:
        .loc 1 199 0
        .cfi_startproc
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
.LVL22:
        stmfd   sp!, {r4, r5, r6, r7, r8, r9, sl}
.LCFI2:
        .cfi_def_cfa_offset 28
        .cfi_offset 4, -28
        .cfi_offset 5, -24
        .cfi_offset 6, -20
        .cfi_offset 7, -16
        .cfi_offset 8, -12
        .cfi_offset 9, -8
        .cfi_offset 10, -4
        .loc 1 208 0
        movs    r7, r3, lsr #2          @ r7 = quarterPoints = num_points / 4
.LVL23:
        beq     .L31                    @ fewer than 4 points: skip NEON loop
        mov     ip, #0                  @ ip = number = 0
.LVL24:
@ ---- Hand-written NEON loop: 4 floats per iteration --------------------
.L32:
        .loc 1 209 0 discriminator 2
#APP
@ 209 "/home/MSS/nathan-west/code/volk/volk_arm/kernels/volk_arm/volk_arm_32f_x2_add_32f.h" 1
        vld1.32 {d0-d1}, [r1]!
        vld1.32 {d2-d3}, [r2]!
        vadd.f32 q1, q0, q1
        vst1.32 {d2-d3}, [r0]!
@ 0 "" 2
        .loc 1 208 0 discriminator 2
        add     ip, ip, #1              @ number++
.LVL25:
        cmp     ip, r7
        bne     .L32                    @ loop until number == quarterPoints
.LVL26:
@ ---- Tail: elements quarterPoints*4 .. num_points-1 --------------------
.L31:
        .loc 1 221 0
        mov     r7, r7, asl #2          @ r7 = number = quarterPoints * 4
.LVL27:
        .loc 1 222 0
        cmp     r3, r7
        bls     .L30                    @ num_points <= number: nothing left
        .loc 1 199 0
        @ Compiler-generated guard for its own vectorised tail loop (.L35):
        @ r4 = 1 when c and a are at least 16 bytes apart,
        @ ip = 1 when c and b are at least 16 bytes apart.
        add     ip, r0, #16
        add     r4, r1, #16
        cmp     r0, r4
        cmpcc   r1, ip
        add     r5, r2, #16
        movcc   r4, #0
        movcs   r4, #1
        cmp     r0, r5
        cmpcc   r2, ip
        movcc   ip, #0
        movcs   ip, #1
        rsb     r9, r7, r3              @ r9 = num_points - number (remaining)
        and     ip, r4, ip
        mov     r8, r9, lsr #2          @ r8 = remaining / 4 (vector trips)
        cmp     r9, #3
        movls   ip, #0                  @ 3 or fewer remaining: do not vectorise
        andhi   ip, ip, #1
        eor     ip, ip, #1              @ ip = 1 means "skip vector loop"
        mov     sl, r8, asl #2          @ sl = elements the vector loop covers
        cmp     r8, #0
        orreq   ip, ip, #1
        cmp     ip, #0
        bne     .L42
        mov     r6, r1
        mov     r5, r2
        mov     r4, r0
.LVL28:
@ ---- Compiler-vectorised tail loop: 4 floats per iteration -------------
.L35:
        .loc 1 223 0 discriminator 2
        vld1.32 {q9}, [r5]!
        add     ip, ip, #1
        cmp     r8, ip
        vld1.32 {q8}, [r6]!
        vadd.f32 q8, q9, q8
        vst1.32 {q8}, [r4]!
        bhi     .L35
        .loc 1 199 0
        mov     r5, sl, asl #2          @ r5 = bytes consumed by the vector loop
        cmp     r9, sl
        add     r7, r7, sl              @ number += elements done
        add     r0, r0, r5
        add     r6, r1, r5
        add     r5, r2, r5
        beq     .L30                    @ vector loop covered everything
@ ---- Choose between the 8x-unrolled loop (.L38) and the scalar loop ----
.L34:
        add     r2, r7, #1
.LVL29:
        sub     r8, r3, #7
        cmp     r3, r2
        movcc   r1, #0
.LVL30:
        movcs   r1, #1
        cmp     r2, r8
        movcs   r1, #0
        cmp     r3, #6
        movls   r2, #0
        andhi   r2, r1, #1
        cmp     r2, #0
        beq     .L43                    @ not enough elements for the unrolled loop
        add     ip, r0, #32
        add     r2, r6, #32
        add     r1, r5, #32
@ ---- 8x-unrolled scalar loop; pointers are biased by +32 bytes ---------
.L38:
        .loc 1 223 0
        flds    s14, [r1, #-32]
        add     r0, r7, #9
        flds    s15, [r2, #-32]
        cmp     r8, r0                  @ flags consumed by "bhi .L38" below
        .loc 1 222 0
        add     r4, r7, #8
        pld     [r2, #48]
        .loc 1 199 0
        mov     r6, r2
        mov     r5, r1
        mov     r0, ip
        .loc 1 222 0
        mov     r7, r4
        .loc 1 223 0
        fadds   s15, s14, s15
        fsts    s15, [ip, #-32]
        flds    s14, [r1, #-28]
        flds    s15, [r2, #-28]
        fadds   s15, s14, s15
        fsts    s15, [ip, #-28]
        flds    s14, [r1, #-24]
        flds    s15, [r2, #-24]
        fadds   s15, s14, s15
        fsts    s15, [ip, #-24]
        flds    s14, [r1, #-20]
        flds    s15, [r2, #-20]
        fadds   s15, s14, s15
        fsts    s15, [ip, #-20]
        flds    s14, [r1, #-16]
        flds    s15, [r2, #-16]
        fadds   s15, s14, s15
        fsts    s15, [ip, #-16]
        flds    s14, [r1, #-12]
        flds    s15, [r2, #-12]
        fadds   s15, s14, s15
        fsts    s15, [ip, #-12]
        flds    s14, [r1, #-8]
        flds    s15, [r2, #-8]
        fadds   s15, s14, s15
        fsts    s15, [ip, #-8]
        flds    s14, [r1, #-4]
        add     r1, r1, #32
        flds    s15, [r2, #-4]
        add     r2, r2, #32
        fadds   s15, s14, s15
        fsts    s15, [ip, #-4]
.LVL31:
        add     ip, ip, #32
        bhi     .L38
.LVL32:
@ ---- Scalar loop: exactly one element per iteration --------------------
.L37:
        .loc 1 199 0
        sub     r1, r6, #4              @ pre-bias: loop body pre-increments by 4
        sub     r2, r5, #4
.L39:
        .loc 1 223 0
        add     r2, r2, #4              @ advance b by one float
        add     r1, r1, #4              @ advance a by one float
        flds    s14, [r2, #0]
        .loc 1 222 0
        add     r4, r4, #1              @ number++ (once per stored element)
        .loc 1 223 0
        flds    s15, [r1, #0]
        .loc 1 222 0
        cmp     r3, r4
        .loc 1 223 0
        fadds   s15, s14, s15
        fstmias r0!, {s15}              @ *cPtr++ = a + b
.LVL33:
        .loc 1 222 0
        bhi     .L39                    @ while num_points > number
.LVL34:
@ ---- Epilogue ----------------------------------------------------------
.L30:
        .loc 1 226 0
        ldmfd   sp!, {r4, r5, r6, r7, r8, r9, sl}
        bx      lr
.L43:
        .loc 1 199 0
        mov     r4, r7                  @ scalar loop counter starts at number
        b       .L37
.LVL35:
.L42:
        .loc 1 205 0
        mov     r5, r2                  @ vector loop skipped: b pointer = r2
        .loc 1 204 0
        mov     r6, r1                  @ vector loop skipped: a pointer = r1
        b       .L34
        .cfi_endproc
.LFE1897:
        .size volk_arm_32f_x2_add_32f_a_inlineneon, .-volk_arm_32f_x2_add_32f_a_inlineneon
.align 2 | |
.type volk_arm_32f_x2_multiply_32f_generic, %function |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@ Verbose (-fverbose-asm style) compiler listing of
@ volk_arm_32f_x2_add_32f_a_inlineneon. Per the compiler's own annotations:
@ r0 = cPtr, r1 = aVector, r2 = bVector, r3 = num_points.
@ r4-r9 and sl are saved on entry and restored at .L30.
volk_arm_32f_x2_add_32f_a_inlineneon: | |
.LFB1897: | |
.loc 1 199 0 | |
.cfi_startproc | |
@ args = 0, pretend = 0, frame = 0 | |
@ frame_needed = 0, uses_anonymous_args = 0 | |
@ link register save eliminated. | |
.LVL22: | |
stmfd sp!, {r4, r5, r6, r7, r8, r9, sl} @, | |
.LCFI2: | |
.cfi_def_cfa_offset 28 | |
.cfi_offset 4, -28 | |
.cfi_offset 5, -24 | |
.cfi_offset 6, -20 | |
.cfi_offset 7, -16 | |
.cfi_offset 8, -12 | |
.cfi_offset 9, -8 | |
.cfi_offset 10, -4 | |
.loc 1 208 0 | |
@ quarterPoints = num_points >> 2; the S suffix sets Z so a zero count
@ skips the NEON loop entirely.
movs r7, r3, lsr #2 @ quarterPoints, num_points, | |
.LVL23: | |
beq .L31 @, | |
mov ip, #0 @ number, | |
.LVL24: | |
@ Hand-written NEON loop (the inline asm between #APP and "@ 0"):
@ loads 4 floats from a and b, adds them, stores 4 floats to c, with
@ post-increment ("!") on all three address registers.
.L32: | |
.loc 1 209 0 discriminator 2 | |
#APP | |
@ 209 "/home/MSS/nathan-west/code/volk/volk_arm/kernels/volk_arm/volk_arm_32f_x2_add_32f.h" 1 | |
vld1.32 {d0-d1}, [r1]! @ aVector | |
vld1.32 {d2-d3}, [r2]! @ bVector | |
vadd.f32 q1, q0, q1 | |
vst1.32 {d2-d3}, [r0]! @ cPtr | |
@ 0 "" 2 | |
.loc 1 208 0 discriminator 2 | |
add ip, ip, #1 @ number, number, | |
.LVL25: | |
cmp ip, r7 @ number, quarterPoints | |
bne .L32 @, | |
.LVL26: | |
@ Tail handling starts here: number = quarterPoints * 4, and the function
@ returns immediately when num_points <= number.
.L31: | |
.loc 1 221 0 | |
mov r7, r7, asl #2 @ number, quarterPoints, | |
.LVL27: | |
.loc 1 222 0 | |
cmp r3, r7 @ num_points, number | |
bls .L30 @, | |
.loc 1 199 0 | |
@ Guard for the compiler's own vectorised copy of the tail loop (.L35):
@ tmp245/tmp251 are 1 when c is at least 16 bytes away from a / b, and
@ the vector path is also rejected when 3 or fewer elements remain.
@ NOTE(review): the annotations below still name r1/r2 "aVector"/"bVector"
@ even though the vld1 ...! instructions above have post-incremented them.
add ip, r0, #16 @ D.17262, cPtr, | |
add r4, r1, #16 @ tmp242, aVector, | |
cmp r0, r4 @ cPtr, tmp242 | |
cmpcc r1, ip @, aVector, D.17262 | |
add r5, r2, #16 @ tmp248, bVector, | |
movcc r4, #0 @, tmp245 | |
movcs r4, #1 @, tmp245 | |
cmp r0, r5 @ cPtr, tmp248 | |
cmpcc r2, ip @, bVector, D.17262 | |
movcc ip, #0 @, tmp251 | |
movcs ip, #1 @, tmp251 | |
@ r9 = elements remaining, r8 = remaining / 4 (vector trip count),
@ sl = r8 * 4 (elements the vector loop would cover).
rsb r9, r7, r3 @ D.17246, number, num_points | |
and ip, r4, ip @ tmp254, tmp245, tmp251 | |
mov r8, r9, lsr #2 @ bnd.310, D.17246, | |
cmp r9, #3 @ D.17246, | |
movls ip, #0 @, tmp258 | |
andhi ip, ip, #1 @,, tmp258, tmp254 | |
eor ip, ip, #1 @ tmp260, tmp258, | |
mov sl, r8, asl #2 @ ratio_mult_vf.311, bnd.310, | |
cmp r8, #0 @ bnd.310, | |
orreq ip, ip, #1 @,, tmp266, tmp260 | |
@ ip != 0 here means "do not run the vectorised tail loop".
cmp ip, #0 @ tmp266, | |
bne .L42 @, | |
mov r6, r1 @ ivtmp.391, aVector | |
mov r5, r2 @ ivtmp.394, bVector | |
mov r4, r0 @ ivtmp.396, cPtr | |
.LVL28: | |
@ Compiler-vectorised tail loop: 4 floats per iteration, r8 iterations.
.L35: | |
.loc 1 223 0 discriminator 2 | |
vld1.32 {q9}, [r5]! @ tmp267, MEM[(const float *)vect_p.321_111] | |
add ip, ip, #1 @ ivtmp.390, ivtmp.390, | |
cmp r8, ip @ bnd.310, ivtmp.390 | |
vld1.32 {q8}, [r6]! @ tmp268, MEM[(const float *)vect_p.316_107] | |
vadd.f32 q8, q9, q8 @ tmp269, tmp267, tmp268 | |
vst1.32 {q8}, [r4]! @ tmp269, MEM[(float *)vect_p.327_116] | |
bhi .L35 @, | |
.loc 1 199 0 | |
@ Advance number and the three pointers past what the vector loop
@ handled (r5 = byte offset); return if nothing is left.
mov r5, sl, asl #2 @ D.17281, ratio_mult_vf.311, | |
cmp r9, sl @ D.17246, ratio_mult_vf.311 | |
add r7, r7, sl @ number, number, ratio_mult_vf.311 | |
add r0, r0, r5 @ cPtr, cPtr, D.17281 | |
add r6, r1, r5 @ aPtr, aVector, D.17281 | |
add r5, r2, r5 @ bPtr, bVector, D.17281 | |
beq .L30 @, | |
@ Decide whether enough elements remain for the 8x-unrolled loop (.L38);
@ otherwise branch to .L43, which falls into the scalar loop.
.L34: | |
add r2, r7, #1 @ D.17302, number, | |
.LVL29: | |
sub r8, r3, #7 @ D.17303, num_points, | |
cmp r3, r2 @ num_points, D.17302 | |
movcc r1, #0 @ tmp274, | |
.LVL30: | |
movcs r1, #1 @ tmp274, | |
cmp r2, r8 @ D.17302, D.17303 | |
movcs r1, #0 @, tmp274, | |
cmp r3, #6 @ num_points, | |
movls r2, #0 @, tmp280 | |
andhi r2, r1, #1 @,, tmp280, tmp274 | |
cmp r2, #0 @ tmp280, | |
beq .L43 @, | |
@ Induction pointers are biased by +32 bytes, hence the negative offsets
@ (printed as large unsigned values in the annotations) inside .L38.
add ip, r0, #32 @ ivtmp.362, cPtr, | |
add r2, r6, #32 @ ivtmp.363, aPtr, | |
add r1, r5, #32 @ ivtmp.364, bPtr, | |
@ 8x-unrolled scalar loop: eight load/load/add/store groups per trip.
@ The "cmp r8, r0" near the top sets the flags consumed by the final
@ "bhi .L38"; no instruction in between writes the condition flags.
.L38: | |
.loc 1 223 0 | |
flds s14, [r1, #-32] @ MEM[base: bPtr_160, offset: 4294967264B], MEM[base: bPtr_160, offset: 4294967264B] | |
add r0, r7, #9 @ ivtmp.332, number, | |
flds s15, [r2, #-32] @ MEM[base: aPtr_161, offset: 4294967264B], MEM[base: aPtr_161, offset: 4294967264B] | |
cmp r8, r0 @ D.17303, ivtmp.332 | |
.loc 1 222 0 | |
add r4, r7, #8 @ number, number, | |
pld [r2, #48] @ | |
.loc 1 199 0 | |
mov r6, r2 @ aPtr, ivtmp.363 | |
mov r5, r1 @ bPtr, ivtmp.364 | |
mov r0, ip @ cPtr, ivtmp.362 | |
.loc 1 222 0 | |
mov r7, r4 @ number, number | |
.loc 1 223 0 | |
fadds s15, s14, s15 @ tmp282, MEM[base: bPtr_160, offset: 4294967264B], MEM[base: aPtr_161, offset: 4294967264B] | |
fsts s15, [ip, #-32] @ tmp282, MEM[base: cPtr_159, offset: 4294967264B] | |
flds s14, [r1, #-28] @ MEM[base: bPtr_160, offset: 4294967268B], MEM[base: bPtr_160, offset: 4294967268B] | |
flds s15, [r2, #-28] @ MEM[base: aPtr_161, offset: 4294967268B], MEM[base: aPtr_161, offset: 4294967268B] | |
fadds s15, s14, s15 @ tmp285, MEM[base: bPtr_160, offset: 4294967268B], MEM[base: aPtr_161, offset: 4294967268B] | |
fsts s15, [ip, #-28] @ tmp285, MEM[base: cPtr_159, offset: 4294967268B] | |
flds s14, [r1, #-24] @ MEM[base: bPtr_160, offset: 4294967272B], MEM[base: bPtr_160, offset: 4294967272B] | |
flds s15, [r2, #-24] @ MEM[base: aPtr_161, offset: 4294967272B], MEM[base: aPtr_161, offset: 4294967272B] | |
fadds s15, s14, s15 @ tmp288, MEM[base: bPtr_160, offset: 4294967272B], MEM[base: aPtr_161, offset: 4294967272B] | |
fsts s15, [ip, #-24] @ tmp288, MEM[base: cPtr_159, offset: 4294967272B] | |
flds s14, [r1, #-20] @ MEM[base: bPtr_160, offset: 4294967276B], MEM[base: bPtr_160, offset: 4294967276B] | |
flds s15, [r2, #-20] @ MEM[base: aPtr_161, offset: 4294967276B], MEM[base: aPtr_161, offset: 4294967276B] | |
fadds s15, s14, s15 @ tmp291, MEM[base: bPtr_160, offset: 4294967276B], MEM[base: aPtr_161, offset: 4294967276B] | |
fsts s15, [ip, #-20] @ tmp291, MEM[base: cPtr_159, offset: 4294967276B] | |
flds s14, [r1, #-16] @ MEM[base: bPtr_160, offset: 4294967280B], MEM[base: bPtr_160, offset: 4294967280B] | |
flds s15, [r2, #-16] @ MEM[base: aPtr_161, offset: 4294967280B], MEM[base: aPtr_161, offset: 4294967280B] | |
fadds s15, s14, s15 @ tmp294, MEM[base: bPtr_160, offset: 4294967280B], MEM[base: aPtr_161, offset: 4294967280B] | |
fsts s15, [ip, #-16] @ tmp294, MEM[base: cPtr_159, offset: 4294967280B] | |
flds s14, [r1, #-12] @ MEM[base: bPtr_160, offset: 4294967284B], MEM[base: bPtr_160, offset: 4294967284B] | |
flds s15, [r2, #-12] @ MEM[base: aPtr_161, offset: 4294967284B], MEM[base: aPtr_161, offset: 4294967284B] | |
fadds s15, s14, s15 @ tmp297, MEM[base: bPtr_160, offset: 4294967284B], MEM[base: aPtr_161, offset: 4294967284B] | |
fsts s15, [ip, #-12] @ tmp297, MEM[base: cPtr_159, offset: 4294967284B] | |
flds s14, [r1, #-8] @ MEM[base: bPtr_160, offset: 4294967288B], MEM[base: bPtr_160, offset: 4294967288B] | |
flds s15, [r2, #-8] @ MEM[base: aPtr_161, offset: 4294967288B], MEM[base: aPtr_161, offset: 4294967288B] | |
fadds s15, s14, s15 @ tmp300, MEM[base: bPtr_160, offset: 4294967288B], MEM[base: aPtr_161, offset: 4294967288B] | |
fsts s15, [ip, #-8] @ tmp300, MEM[base: cPtr_159, offset: 4294967288B] | |
flds s14, [r1, #-4] @ MEM[base: bPtr_160, offset: 4294967292B], MEM[base: bPtr_160, offset: 4294967292B] | |
add r1, r1, #32 @ ivtmp.364, ivtmp.364, | |
flds s15, [r2, #-4] @ MEM[base: aPtr_161, offset: 4294967292B], MEM[base: aPtr_161, offset: 4294967292B] | |
add r2, r2, #32 @ ivtmp.363, ivtmp.363, | |
fadds s15, s14, s15 @ tmp303, MEM[base: bPtr_160, offset: 4294967292B], MEM[base: aPtr_161, offset: 4294967292B] | |
fsts s15, [ip, #-4] @ tmp303, MEM[base: cPtr_159, offset: 4294967292B] | |
.LVL31: | |
add ip, ip, #32 @ ivtmp.362, ivtmp.362, | |
bhi .L38 @, | |
.LVL32: | |
@ Scalar loop setup: back the a/b pointers up by one float because the
@ loop body pre-increments them before each load.
.L37: | |
.loc 1 199 0 | |
sub r1, r6, #4 @ ivtmp.343, aPtr, | |
sub r2, r5, #4 @ ivtmp.347, bPtr, | |
@ Scalar loop: one element per iteration; r4 (number) is incremented once
@ per stored result and the loop runs while num_points > number.
.L39: | |
.loc 1 223 0 | |
add r2, r2, #4 @ ivtmp.347, ivtmp.347, | |
add r1, r1, #4 @ ivtmp.343, ivtmp.343, | |
flds s14, [r2, #0] @ MEM[base: D.17338_196, offset: 0B], MEM[base: D.17338_196, offset: 0B] | |
.loc 1 222 0 | |
add r4, r4, #1 @ number, number, | |
.loc 1 223 0 | |
flds s15, [r1, #0] @ MEM[base: D.17337_197, offset: 0B], MEM[base: D.17337_197, offset: 0B] | |
.loc 1 222 0 | |
cmp r3, r4 @ num_points, number | |
.loc 1 223 0 | |
fadds s15, s14, s15 @ tmp306, MEM[base: D.17338_196, offset: 0B], MEM[base: D.17337_197, offset: 0B] | |
fstmias r0!, {s15} @ cPtr, tmp306 | |
.LVL33: | |
.loc 1 222 0 | |
bhi .L39 @, | |
.LVL34: | |
@ Epilogue: restore the registers pushed on entry and return.
.L30: | |
.loc 1 226 0 | |
ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl} | |
bx lr | |
@ Reached when the unrolled loop is not used: seed the scalar loop counter.
.L43: | |
.loc 1 199 0 | |
mov r4, r7 @ number, number | |
b .L37 @ | |
.LVL35: | |
@ Reached when the vectorised tail loop is skipped: the a/b pointers for
@ the remaining loops are taken straight from r1/r2.
.L42: | |
.loc 1 205 0 | |
mov r5, r2 @ bPtr, bVector | |
.loc 1 204 0 | |
mov r6, r1 @ aPtr, aVector | |
b .L34 @ | |
.cfi_endproc | |
.LFE1897: | |
.size volk_arm_32f_x2_add_32f_a_inlineneon, .-volk_arm_32f_x2_add_32f_a_inlineneon | |
.align 2 | |
.type volk_arm_32f_x2_multiply_32f_generic, %function |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment