Last active
December 29, 2015 00:29
-
-
Save astocko/7586423 to your computer and use it in GitHub Desktop.
Fast sine approximation using SSE instructions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Coefficient table read by sin(): 65 entries of 16 bytes each (1040 bytes
 * of payload), every entry being two little-endian IEEE-754 doubles.
 * sin() addresses it as [base + idx*8] and [base + idx*8 + 8] with idx an
 * even value in 0..128, so the last entry ends at byte offset 1039.
 *
 * The first double of an entry is multiplied by the reduced argument, the
 * second one is used as-is; presumably they are cos/sin samples on a
 * pi/128 grid, pre-divided by the polynomial constants used in sin()
 * (entry 0 is {~2.46e-6, 0}, entry 64 is {0, ~3.01e-4}) -- TODO confirm
 * against the original generator.
 *
 * The literal previously stopped one byte short, so the implicit NUL
 * terminator took the place of the final 0x3F and turned the last double
 * into a near-zero value; that byte is now spelled out explicitly.
 *
 * Kept as a `char *` variable (not an array) because sin() loads the
 * pointer value with `mov r10,[ctable]`.
 */
char *ctable = "\x53\xBE\x25\xE6\xBC\xAB\xC4\x3E\x00\x00\x00\x00\x00\x00\x00"
"\x00\x7B\xFC\xAB\xE4\x24\xAA\xC4\x3E\x16\x46\x6F\x57\xCF\x00\xDF\x3E\xFB\x53"
"\x29\x1F\x5D\xA5\xC4\x3E\x47\x5F\xF3\x64\x6B\xFE\xEE\x3E\x2D\xE8\x53\x52\x66"
"\x9D\xC4\x3E\x3F\x66\xCA\xCB\xD3\x3B\xF7\x3E\x18\x49\x90\xB8\x41\x92\xC4\x3E"
"\x3B\xF9\x14\xB6\xDC\xF4\xFE\x3E\x53\xF8\xC0\x09\xF1\x83\xC4\x3E\x8B\xC8\x87"
"\xC9\x8F\x54\x03\x3F\xF2\x93\x02\x7B\x76\x72\xC4\x3E\x1B\x4C\x71\x20\xB6\x2B"
"\x07\x3F\xEA\xB1\x54\xBE\xD4\x5D\xC4\x3E\x84\x64\x87\xC4\x49\xFF\x0A\x3F\x64"
"\x79\x2F\x02\x0F\x46\xC4\x3E\xC1\x78\x97\xA7\xB3\xCE\x0E\x3F\x65\x0A\x06\xF1"
"\x28\x2B\xC4\x3E\x93\x8D\xE3\xAF\xAE\x4C\x11\x3F\x28\xC7\xB5\xB0\x26\x0D\xC4"
"\x3E\x9F\xE9\x62\x9F\x58\x2F\x13\x3F\x7B\x95\xE2\xE1\x0C\xEC\xC3\x3E\x1D\xFE"
"\x70\x34\x0D\x0F\x15\x3F\x5E\x31\x40\x9F\xE0\xC7\xC3\x3E\x75\x94\xFE\x75\x82"
"\xEB\x16\x3F\x12\xAD\xC8\x7C\xA7\xA0\xC3\x3E\x78\x41\x2E\xEB\x6E\xC4\x18\x3F"
"\xAD\x3D\xE0\x86\x67\x76\xC3\x3E\x58\xD0\xA8\xA6\x89\x99\x1A\x3F\x19\x76\x66"
"\x41\x27\x49\xC3\x3E\xA7\x29\xDC\x51\x8A\x6A\x1C\x3F\x6C\x15\xB5\xA6\xED\x18"
"\xC3\x3E\x8A\xFA\x22\x38\x29\x37\x1E\x3F\x1D\x90\x8C\x26\xC2\xE5\xC2\x3E\xA7"
"\x63\xD3\x51\x1F\xFF\x1F\x3F\xB5\x7E\xEE\xA4\xAC\xAF\xC2\x3E\xB1\x7D\x99\x27"
"\x13\xE1\x20\x3F\x29\x1F\xE6\x78\xB5\x76\xC2\x3E\x7C\x39\xA7\x51\xFC\xBF\x21"
"\x3F\xE9\x18\x3F\x6B\xE5\x3A\xC2\x3E\x3F\x99\x59\xC7\x28\x9C\x22\x3F\x76\xB6"
"\x2A\xB5\x45\xFC\xC1\x3E\x2B\xA2\x04\x95\x76\x75\x23\x3F\xE8\xC9\xD3\xFE\xDF"
"\xBA\xC1\x3E\xD1\xF6\x45\x38\xC4\x4B\x24\x3F\x96\x74\xE1\x5D\xBE\x76\xC1\x3E"
"\x2B\xAB\x2F\xA5\xF0\x1E\x25\x3F\xA0\x0C\xE9\x53\xEB\x2F\xC1\x3E\x78\xD4\x60"
"\x4B\xDB\xEE\x25\x3F\xC5\x5D\xCF\xCC\x71\xE6\xC0\x3E\xBF\x0B\x0B\x1B\x64\xBB"
"\x26\x3F\x79\x85\x19\x1D\x5D\x9A\xC0\x3E\xC1\x1C\xE4\x89\x6B\x84\x27\x3F\xB2"
"\xAC\x2D\x00\xB9\x4B\xC0\x3E\x27\x1E\x03\x98\xD2\x49\x28\x3F\xDC\xC8\x07\x2D"
"\x23\xF5\xBF\x3E\xEF\x32\xA8\xD4\x7A\x0B\x29\x3F\x9E\xD8\x8E\xC7\xE6\x4D\xBF"
"\x3E\x6D\x38\xEE\x62\x46\xC9\x29\x3F\x63\x5A\xD1\x99\xD6\xA1\xBE\x3E\x76\xA7"
"\x65\xFE\x17\x83\x2A\x3F\xB1\x67\x40\x2C\x0D\xF1\xBD\x3E\x11\xF3\x97\xFF\xD2"
"\x38\x2B\x3F\x7E\xF4\xC5\xC1\xA5\x3B\xBD\x3E\x62\xB2\x72\x60\x5B\xEA\x2B\x3F"
"\x85\xA0\x90\x53\xBC\x81\xBC\x3E\x76\xE7\x99\xC0\x95\x97\x2C\x3F\x57\x6D\xC3"
"\x8C\x6D\xC3\xBB\x3E\x4F\xB8\xA0\x69\x67\x40\x2D\x3F\x65\x03\x0A\xC6\xD6\x00"
"\xBB\x3E\x8B\xF3\x27\x53\xB6\xE4\x2D\x3F\x87\x34\x12\x01\x16\x3A\xBA\x3E\x09"
"\xBE\xE1\x26\x69\x84\x2E\x3F\xB3\x6E\xEB\xE3\x49\x6F\xB9\x3E\x19\xCB\x79\x44"
"\x67\x1F\x2F\x3F\x7B\xD5\x4C\xB4\x91\xA0\xB8\x3E\xE8\x84\x61\xC5\x98\xB5\x2F"
"\x3F\xF1\xBC\xC2\x52\x0D\xCE\xB7\x3E\xAC\xC7\x3F\x40\x73\x23\x30\x3F\x4C\x44"
"\xC4\x35\xDD\xF7\xB6\x3E\xD7\x09\xE1\x06\x9D\x69\x30\x3F\x5C\xD2\xB1\x64\x22"
"\x1E\xB6\x3E\x19\xA4\xC9\xE4\x3E\xAD\x30\x3F\x67\x39\xBD\x72\xFE\x40\xB5\x3E"
"\x9A\xD5\x17\x6C\x4E\xEE\x30\x3F\x86\x4B\xBC\x79\x93\x60\xB4\x3E\xF9\xB6\x6E"
"\x94\xC1\x2C\x31\x3F\xD8\xAB\xE6\x14\x04\x7D\xB3\x3E\x15\x48\x82\xBC\x8E\x68"
"\x31\x3F\x32\xAC\x7F\x5B\x73\x96\xB2\x3E\x34\x99\x93\xAB\xAC\xA1\x31\x3F\xED"
"\x09\x6D\xDB\x04\xAD\xB1\x3E\xD9\xD4\xDC\x92\x12\xD8\x31\x3F\x68\x5F\xBB\x93"
"\xDC\xC0\xB0\x3E\x25\xF2\xEC\x0E\xB8\x0B\x32\x3F\x80\x45\x22\xDE\x3D\xA4\xAF"
"\x3E\x2D\xDA\xF2\x28\x95\x3C\x32\x3F\x9B\x19\x22\x7C\xE1\xC1\xAD\x3E\x38\xCD"
"\xF7\x57\xA2\x6A\x32\x3F\x3D\x8C\x59\x63\xEE\xDA\xAB\x3E\x57\xD7\x08\x82\xD8"
"\x95\x32\x3F\x0B\x74\xD1\xAA\xAF\xEF\xA9\x3E\x8D\x26\x4F\xFD\x30\xBE\x32\x3F"
"\x56\xE4\x23\x13\x71\x00\xA8\x3E\x45\x17\x17\x91\xA5\xE3\x32\x3F\x48\xBE\xCD"
"\xFA\x7E\x0D\xA6\x3E\x7D\xCE\xC5\x76\x30\x06\x33\x3F\x51\xE9\x67\x52\x26\x17"
"\xA4\x3E\xD7\x3B\xBD\x5A\xCC\x25\x33\x3F\xB5\x03\xCA\x90\xB4\x1D\xA2\x3E\x57"
"\x60\x2E\x5D\x74\x42\x33\x3F\x8F\x5F\x18\xA7\x77\x21\xA0\x3E\x72\xB9\xD9\x12"
"\x24\x5C\x33\x3F\xCC\x47\x7E\xE9\x7B\x45\x9C\x3E\xAB\xB2\xBD\x85\xD7\x72\x33"
"\x3F\x85\xD8\xB8\x76\xAC\x43\x98\x3E\xF3\x02\xB3\x35\x8B\x86\x33\x3F\xC6\x87"
"\x36\x25\x1F\x3E\x94\x3E\x96\xDD\xF6\x18\x3C\x97\x33\x3F\xEA\xB7\x02\xB8\x72"
"\x35\x90\x3E\x78\xE1\xA2\x9C\xE7\xA4\x33\x3F\x3C\x79\xC5\xDA\x8C\x54\x88\x3E"
"\x1C\xB4\x12\xA5\x8B\xAF\x33\x3F\xA6\xFF\x8E\xCC\x73\x3A\x80\x3E\xC4\x38\x37"
"\x8E\x26\xB7\x33\x3F\x01\xBE\xBF\x36\xB4\x3B\x70\x3E\xE5\x56\xD7\x2B\xB7\xBB"
"\x33\x3F\x00\x00\x00\x00\x00\x00\x00\x00\xDE\x45\xBE\xC9\x3C\xBD\x33\x3F";
/*
 * Scale factor applied by sin() on its tiny-argument path: the first eight
 * bytes are the little-endian double 0x3FEFFFFFFF800000, i.e. 1 - 2^-30;
 * the following eight bytes are zero padding.
 *
 * Kept as a `char *` variable so existing references keep their meaning.
 */
char *one_m230 = "\x00\x00\x80\xff\xff\xff\xef\x3f"
"\x00\x00\x00\x00\x00\x00\x00\x00";
/*
 * Table read by sin() on its large-argument path (|x| bit pattern above
 * 0x49FFFFFF): 15 entries of 16 bytes (240 bytes of payload), selected by
 * the float exponent as [base + off] and [base + off + 8] with off a
 * multiple of 16 in 0..224.
 *
 * Each entry is two little-endian doubles. sin() splits the first one into
 * its upper 40 bits and its lower 24 bits (shifted up by 40) before
 * multiplying; the second one is multiplied directly. Presumably these are
 * successive windows of the binary expansion of 128/pi -- TODO confirm
 * against the original generator.
 *
 * The literal previously stopped one byte short, so the implicit NUL
 * terminator became the top byte of the last double. The second doubles of
 * the preceding entries are sliding windows over one bit stream whose
 * exponent field drops by exactly the shift amount (0x38C, 0x384, 0x37B);
 * continuing that sequence gives exponent field 0x371 for the last entry,
 * hence the restored final byte 0x37.
 *
 * Kept as a `char *` variable (not an array) because sin() loads the
 * pointer value with `mov rax,[pitable]`.
 */
char *pitable = "\x39\x79\x3E\x6D\x30\x5F\x44\x40\x48\x5F\x9D\xF0\xA7\x54\x70"
"\x3D\x39\x79\x3E\x6D\x30\x5F\x44\x40\x48\x5F\x9D\xF0\xA7\x54\x70\x3D\x88\x0C"
"\x3E\x72\x1B\xCC\xE7\x3F\xF5\xD1\x57\x27\xFC\x29\x15\x3D\x54\x70\x3D\xE4\xE4"
"\x36\x58\x3F\x9A\xFA\xE8\xAB\x13\xFE\xA4\x3C\x29\x15\x3D\x20\x72\x72\xEB\x3E"
"\x9C\xA6\x3E\xFA\xEA\x84\x4F\x3C\xF8\x83\x3C\xA9\x20\x72\x62\x3E\xE0\x6E\x9A"
"\xFA\xE8\xAB\xA3\x3B\x27\x1C\x3C\x4F\xA9\x20\xE2\x3D\x37\x70\x37\x4D\x7D\xF4"
"\x35\x3B\xF4\x35\x3B\x27\xFC\x29\x15\x3D\x29\xB6\x0D\xDC\x4D\x53\x5F\x3A\x46"
"\xFF\x3A\x75\xC2\x9F\xD2\x3C\xA5\xD8\x36\x70\x37\x4D\x3D\x3A\x50\x5F\x3A\x7D"
"\x75\xC2\x5F\x3C\xCD\x4A\xB1\x6D\xE0\x6E\xAA\x39\x4D\xD3\x39\xFA\xE8\xAB\xA3"
"\x3B\x79\x32\x2B\xC5\xB6\x81\x0B\x39\xDD\x94\x39\xA9\x8F\xBE\x6A\x3B\x88\x27"
"\xB3\x52\x6C\x1B\xC8\x38\xC0\x1D\x39\xA6\xA9\x8F\xEE\x3A\x72\x88\x27\xB3\x52"
"\x6C\x4B\x38\x60\x4B\x38\xDC\x4D\x53\x5F\x3A\x10\xE4\x10\x4F\x66\xA5\xB8\x37"
"\xC5\x06\x38\x0D\xDC\x4D\xD3\x39\xFE\x41\x90\x43\x3C\x99\x15\x37";
float sin(float x) { | |
float res; | |
__asm { | |
movss xmm0, x | |
push rsi | |
//add rsp, 0x8 | |
stmxcsr [rsp] | |
mov eax, [rsp] | |
and eax, 0xffff9fff | |
cmp eax, [rsp] | |
jnz label_1804 | |
label_15d6: | |
pshuflw xmm1,xmm0,0x44 | |
movd ecx,xmm0 | |
cvtps2pd xmm1,xmm1 | |
mov rax,0x40445f306e000000 | |
movq xmm2,rax | |
mov r10d,0x7fffffff | |
mov r8d,0x49ffffff | |
and r10d,ecx | |
sub r8d,r10d | |
sub r10d,0x39800000 | |
or r8d,r10d | |
jl label_170D | |
mov r11,0x4338000000000000 | |
movq xmm4,r11 | |
mulsd xmm2,xmm1 | |
mov rdx,0xbe5b1bbead603d8b | |
movq xmm3,rdx | |
movapd xmm5,xmm2 | |
addsd xmm2,xmm4 | |
movd r9d,xmm2 | |
subsd xmm2,xmm4 | |
mulsd xmm1,xmm3 | |
label_164a: | |
mov r10,[ctable] | |
mov eax,0x180 | |
mov rdx,0x40c37423899a1558 | |
movq xmm4,rdx | |
subsd xmm5,xmm2 | |
add r9d,r9d | |
mov r8d,r9d | |
sar r9b,0x7 | |
and eax,r8d | |
add r8b,r9b | |
xor r8b,r9b | |
addsd xmm1,xmm5 | |
and r8d,0xfe | |
movsd xmm3,[r10+r8*8] | |
mov r11,0x40a9f02f6222c720 | |
movq xmm0,r11 | |
movddup xmm2,xmm1 | |
mulsd xmm1,xmm1 | |
movsd xmm5,[r10+r8*8+0x8] | |
mov r9d,eax | |
add r9d,0x80 | |
and r9d,0x100 | |
shl r9,0x37 | |
mulsd xmm3,xmm2 | |
movq xmm2,r9 | |
subsd xmm0,xmm1 | |
and eax,0x100 | |
shl rax,0x37 | |
subsd xmm4,xmm1 | |
movq xmm1,rax | |
xorpd xmm4,xmm2 | |
mulsd xmm0,xmm5 | |
mulsd xmm3,xmm4 | |
xorpd xmm0,xmm1 | |
addsd xmm0,xmm3 | |
mov eax,[rsp] | |
and eax,0xffff9fff | |
cmp eax,[rsp] | |
jnz label_1812 | |
cvtpd2ps xmm0,xmm0 | |
jmp label_182a | |
label_170D: | |
and ecx,0x7fffffff | |
cmp ecx,0x3e000000 | |
ja label_1741 | |
cvtss2sd xmm0,xmm0 | |
mulsd xmm0,[one_m230] | |
mov eax,[rsp] | |
and eax,0xffff9fff | |
cmp eax,[rsp] | |
jnz label_1812 | |
label_17fe: | |
cvtsd2ss xmm0,xmm0 | |
jmp label_182a | |
label_1741: | |
movd eax,xmm0 | |
and eax,0x7f800000 | |
cmp eax,0x7f800000 | |
jz label_17d5 | |
shr r10d,0x17 | |
sub r10d,+0x1b | |
and r10d,0xfff8 | |
mov rax, [pitable] | |
movsd xmm3,[rax+r10*2] | |
movsd xmm5,[rax+r10*2+0x8] | |
mov r9,0xffffffffff000000 | |
movq xmm2,r9 | |
mov r11,0x4338000000000000 | |
movq xmm4,r11 | |
andpd xmm2,xmm3 | |
psllq xmm3,0x28 | |
mulsd xmm2,xmm1 | |
mulsd xmm3,xmm1 | |
mulsd xmm1,xmm5 | |
movapd xmm0,xmm2 | |
addsd xmm2,xmm3 | |
movapd xmm5,xmm2 | |
subsd xmm0,xmm2 | |
addsd xmm2,xmm4 | |
addsd xmm3,xmm0 | |
movd r9d,xmm2 | |
subsd xmm2,xmm4 | |
addsd xmm1,xmm3 | |
jmp label_164a | |
label_17d5: | |
subss xmm0,xmm0 | |
mov eax,[rsp] | |
and eax,0xffff9fff | |
cmp eax,[rsp] | |
jz label_17fc | |
stmxcsr [rsp+0x4] | |
mov eax,[rsp] | |
and eax,0x6000 | |
or [rsp+0x4],eax | |
ldmxcsr [rsp+0x4] | |
label_17fc: | |
jmp label_182a | |
cvtsd2ss xmm0,xmm0 | |
jmp label_182a | |
label_1804: | |
mov [rsp+0x4],eax | |
ldmxcsr [rsp+0x4] | |
jmp label_15d6 | |
label_1812: | |
stmxcsr [rsp+0x4] | |
mov eax,[rsp] | |
and eax,0x6000 | |
or [rsp+0x4],eax | |
ldmxcsr [rsp+0x4] | |
jmp label_17fe | |
label_182a: | |
pop rcx | |
movss res, xmm0 | |
} | |
return res; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment