Created
August 5, 2012 22:32
-
-
Save KindDragon/3267532 to your computer and use it in GitHub Desktop.
Test Asm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ---------------------------------------------------------------------------
; Mixed source/disassembly listing of Test(kernel, src, dst, n):
; 32-bit x86, SSE, Intel operand order (op dst, src). The C++ lines are the
; intrinsics source; the debugger places them by line info, so they do not
; always sit directly above the instructions that implement them.
;
; Register / stack roles as used by the instructions below:
;   ecx        = kernel on entry; reused as the 14-count priming counter,
;                then as the dst cursor
;   edx        = src cursor (advanced by 4 bytes per sample)
;   esi        = n, the pipeline-loop counter (saved/restored by push/pop)
;   eax        = address of the dst element stored in the current iteration
;   xmm0       = zero / scratch product
;   xmm1       = scratch, then s (current sample broadcast to all four lanes)
;   xmm5,xmm2,xmm3,xmm4 = x0,x1,x2,x3 accumulators
;   xmm7,xmm6  = k0,k1, turned in place into f2,f3
;   [esp+10h]  = f0 and [esp+20h] = f1 (offsets as seen after `push esi`)
; dst and n are read from memory operands ([dst], [n]), not from registers.
; ---------------------------------------------------------------------------
void Test( float* kernel, float* src, float* dst, int n ) | |
{ | |
; NOTE(review): `in al,dx` encodes as the single byte ECh, which is also the
; last byte of `mov ebp,esp` (8B EC). This looks like the listing starting
; mid-instruction after an undisplayed `push ebp / mov ebp,esp`; the
; `mov esp,ebp / pop ebp` epilogue is consistent with that. Confirm against
; the raw code bytes.
in al,dx | |
; Round esp down to a 16-byte boundary, then reserve 2Ch bytes of locals.
and esp,0FFFFFFF0h | |
sub esp,2Ch | |
__m128 zero = _mm_setzero_ps(); | |
__m128 x0 = zero; | |
__m128 x1 = zero; | |
__m128 x2 = zero; | |
__m128 x3 = zero; | |
__m128 f0; | |
__m128 f1; | |
__m128 f2; | |
__m128 f3; | |
// init filter | |
__m128 k0 = _mm_loadu_ps(kernel + 0); | |
__m128 k1 = _mm_loadu_ps(kernel + 4); | |
; xmm6 = k1 (kernel + 10h bytes = 4 floats), xmm7 = k0; unaligned loads.
movups xmm6,xmmword ptr [ecx+10h] | |
movups xmm7,xmmword ptr [ecx] | |
f0 = _mm_shuffle_ps(k1, k1, _MM_SHUFFLE(0, 1, 2, 3)); | |
; 1Bh == _MM_SHUFFLE(0,1,2,3): lane order reversed. xmm1 = f0.
movaps xmm1,xmm6 | |
shufps xmm1,xmm6,1Bh | |
; xmm0 = zero.
xorps xmm0,xmm0 | |
; Spill f0. The `push esi` further down lowers esp by 4, so this slot is
; addressed as [esp+10h] from then on.
movaps xmmword ptr [esp+0Ch],xmm1 | |
f1 = _mm_shuffle_ps(k0, k0, _MM_SHUFFLE(0, 1, 2, 3)); | |
; xmm1 = f1 (k0 with lane order reversed).
movaps xmm1,xmm7 | |
shufps xmm1,xmm7,1Bh | |
f2 = _mm_move_ss(k0, k1); | |
; Low lane of xmm7 (k0) replaced by low lane of xmm6 (k1).
movss xmm7,xmm6 | |
f2 = _mm_shuffle_ps(f2, f2, _MM_SHUFFLE(0, 3, 2, 1)); | |
f3 = _mm_move_ss(k1, zero); | |
; Low lane of xmm6 (k1) replaced by zero.
movss xmm6,xmm0 | |
; Save esi before it is used as the loop counter for n.
push esi | |
; x0..x3 (xmm5, xmm2, xmm3, xmm4) start at zero.
movaps xmm5,xmm0 | |
movaps xmm2,xmm0 | |
movaps xmm3,xmm0 | |
movaps xmm4,xmm0 | |
; Spill f1.
movaps xmmword ptr [esp+20h],xmm1 | |
; 39h == _MM_SHUFFLE(0,3,2,1). xmm7 = f2.
shufps xmm7,xmm7,39h | |
f3 = _mm_shuffle_ps(f3, f3, _MM_SHUFFLE(0, 3, 2, 1)); | |
; xmm6 = f3.
shufps xmm6,xmm6,39h | |
; Priming loop counter: 0Eh = 14, matching `i<14` in the source.
mov ecx,0Eh | |
; `lea ecx,[ecx]` leaves ecx unchanged; presumably padding so the loop top
; lands on an aligned address - confirm.
lea ecx,[ecx] | |
; ---- priming loop body (the jne at the end of it targets Test+50h) ----
x0 = _mm_move_ss(x0, x1); | |
x0 = _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(0, 3, 2, 1)); | |
x1 = _mm_move_ss(x1, x2); | |
x1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(0, 3, 2, 1)); | |
x2 = _mm_move_ss(x2, x3); | |
x2 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(0, 3, 2, 1)); | |
x3 = _mm_move_ss(x3, zero); | |
x3 = _mm_shuffle_ps(x3, x3, _MM_SHUFFLE(0, 3, 2, 1)); | |
__m128 s = _mm_load1_ps(src++); | |
; Load the current sample into the low lane of xmm1.
movss xmm1,dword ptr [edx] | |
; Low-lane hand-off chain: x0 <- x1, x1 <- x2, x2 <- x3.
movss xmm5,xmm2 | |
movss xmm2,xmm3 | |
movss xmm3,xmm4 | |
; The source lines below repeat the group above; the debugger shows them a
; second time because instructions for those lines continue here.
x0 = _mm_move_ss(x0, x1); | |
x0 = _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(0, 3, 2, 1)); | |
x1 = _mm_move_ss(x1, x2); | |
x1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(0, 3, 2, 1)); | |
x2 = _mm_move_ss(x2, x3); | |
x2 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(0, 3, 2, 1)); | |
x3 = _mm_move_ss(x3, zero); | |
x3 = _mm_shuffle_ps(x3, x3, _MM_SHUFFLE(0, 3, 2, 1)); | |
__m128 s = _mm_load1_ps(src++); | |
; x3 low lane <- zero (xmm0 is zero at this point).
movss xmm4,xmm0 | |
; Immediate 0 copies the low lane to all four lanes: xmm1 = s.
shufps xmm1,xmm1,0 | |
x0 = _mm_add_ps(x0, _mm_mul_ps(f0, s)); | |
; xmm0 = s * f0 (f0 read from its stack slot).
movaps xmm0,xmm1 | |
mulps xmm0,xmmword ptr [esp+10h] | |
; Lane rotations of x0, x1, x2 (shuffle 39h).
shufps xmm5,xmm5,39h | |
shufps xmm2,xmm2,39h | |
shufps xmm3,xmm3,39h | |
; x0 += s * f0.
addps xmm5,xmm0 | |
x1 = _mm_add_ps(x1, _mm_mul_ps(f1, s)); | |
; xmm0 = s * f1 (f1 read from its stack slot).
movaps xmm0,xmm1 | |
mulps xmm0,xmmword ptr [esp+20h] | |
; src++ (one float = 4 bytes).
add edx,4 | |
; Lane rotation of x3.
shufps xmm4,xmm4,39h | |
; x1 += s * f1.
addps xmm2,xmm0 | |
x2 = _mm_add_ps(x2, _mm_mul_ps(f2, s)); | |
; xmm0 = s * f2.
movaps xmm0,xmm1 | |
mulps xmm0,xmm7 | |
x3 = _mm_add_ps(x3, _mm_mul_ps(f3, s)); | |
; xmm1 = s * f3 (s is not needed afterwards, so it is overwritten).
mulps xmm1,xmm6 | |
; x2 += s * f2.
addps xmm3,xmm0 | |
; Re-zero xmm0 so the next iteration's `movss xmm4,xmm0` feeds zero into x3.
xorps xmm0,xmm0 | |
; x3 += s * f3.
addps xmm4,xmm1 | |
; Count down the 14 priming iterations.
dec ecx | |
jne Test+50h (010B1050h) | |
// prime | |
#pragma loop( no_vector ) | |
for(int i=0; i<14; ++i) { | |
; The `for` header is listed after the loop body; the instructions shown
; under it set up the pipeline loop: esi = n, ecx = dst.
mov esi,dword ptr [n] | |
mov ecx,dword ptr [dst] | |
; `lea esp,[esp]` leaves esp unchanged; presumably alignment padding for
; the pipeline loop top - confirm.
lea esp,[esp] | |
} | |
// pipeline | |
; NOTE(review): do/while with `--n` runs the body before the first test and
; only stops when the decremented value reaches 0, so n == 0 on entry does
; not exit after one pass. Confirm callers always pass n >= 1.
do { | |
x0 = _mm_move_ss(x0, x1); | |
x0 = _mm_shuffle_ps(x0, x0, _MM_SHUFFLE(0, 3, 2, 1)); | |
x1 = _mm_move_ss(x1, x2); | |
x1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(0, 3, 2, 1)); | |
x2 = _mm_move_ss(x2, x3); | |
x2 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(0, 3, 2, 1)); | |
x3 = _mm_move_ss(x3, zero); | |
x3 = _mm_shuffle_ps(x3, x3, _MM_SHUFFLE(0, 3, 2, 1)); | |
__m128 s = _mm_load1_ps(src++); | |
; ---- pipeline loop body (the jne at the end of it targets Test+0B0h) ----
; Load the current sample; zero xmm0 for the x3 hand-off.
movss xmm1,dword ptr [edx] | |
xorps xmm0,xmm0 | |
; Low-lane hand-off chain: x0 <- x1, x1 <- x2, x2 <- x3, x3 <- zero.
movss xmm5,xmm2 | |
movss xmm2,xmm3 | |
movss xmm3,xmm4 | |
movss xmm4,xmm0 | |
; Copy the low lane to all four lanes: xmm1 = s.
shufps xmm1,xmm1,0 | |
x0 = _mm_add_ps(x0, _mm_mul_ps(f0, s)); | |
; xmm0 = s * f0.
movaps xmm0,xmm1 | |
mulps xmm0,xmmword ptr [esp+10h] | |
; Lane rotations of x0 and x1.
shufps xmm5,xmm5,39h | |
shufps xmm2,xmm2,39h | |
x1 = _mm_add_ps(x1, _mm_mul_ps(f1, s)); | |
x2 = _mm_add_ps(x2, _mm_mul_ps(f2, s)); | |
x3 = _mm_add_ps(x3, _mm_mul_ps(f3, s)); | |
_mm_store_ss(dst++, x0); | |
; eax = current dst element; ecx is advanced further down before the store.
mov eax,ecx | |
; x0 += s * f0.
addps xmm5,xmm0 | |
; xmm0 = s * f1.
movaps xmm0,xmm1 | |
mulps xmm0,xmmword ptr [esp+20h] | |
; Lane rotations of x2 and x3.
shufps xmm3,xmm3,39h | |
shufps xmm4,xmm4,39h | |
; dst++.
add ecx,4 | |
; x1 += s * f1.
addps xmm2,xmm0 | |
; xmm0 = s * f2, xmm1 = s * f3.
movaps xmm0,xmm1 | |
mulps xmm0,xmm7 | |
mulps xmm1,xmm6 | |
; src++ (computed with lea here rather than add).
lea edx,[edx+4] | |
; x2 += s * f2.
addps xmm3,xmm0 | |
; Store the low lane of x0 to the dst element captured in eax.
movss dword ptr [eax],xmm5 | |
; x3 += s * f3.
addps xmm4,xmm1 | |
} while(--n); | |
; --n, kept in esi; loop while the result is non-zero.
dec esi | |
jne Test+0B0h (010B10B0h) | |
} | |
; Epilogue: restore esi, then restore esp from ebp and pop the saved ebp.
pop esi | |
mov esp,ebp | |
pop ebp | |
ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment