Skip to content

Instantly share code, notes, and snippets.

@astocko
Last active December 29, 2015 00:29
Show Gist options
  • Save astocko/7586423 to your computer and use it in GitHub Desktop.
Save astocko/7586423 to your computer and use it in GitHub Desktop.
Fast sine approximation using SSE instructions.
/*
 * ctable: coefficient table read by sin().
 *
 * Layout: 65 pairs of little-endian IEEE-754 doubles, 16 bytes per pair,
 * 1040 bytes in total. sin() loads the first double of a pair from
 * [ctable + r8*8] and the second from [ctable + r8*8 + 8], where r8 is an
 * even value in 0..128, so byte offsets 0..1039 are all reachable.
 * NOTE(review): the pairs look like scaled cosine/sine samples at multiples
 * of pi/128 (first member falls to 0, second rises from 0) -- confirm against
 * the generator of this table.
 *
 * The final byte (0x3F) used to be missing, so the string's NUL terminator
 * completed the last double and turned 0x3F33BD3CC9BE45DE into
 * 0x0033BD3CC9BE45DE. It is now spelled out explicitly.
 */
char *ctable = "\x53\xBE\x25\xE6\xBC\xAB\xC4\x3E\x00\x00\x00\x00\x00\x00\x00"
"\x00\x7B\xFC\xAB\xE4\x24\xAA\xC4\x3E\x16\x46\x6F\x57\xCF\x00\xDF\x3E\xFB\x53"
"\x29\x1F\x5D\xA5\xC4\x3E\x47\x5F\xF3\x64\x6B\xFE\xEE\x3E\x2D\xE8\x53\x52\x66"
"\x9D\xC4\x3E\x3F\x66\xCA\xCB\xD3\x3B\xF7\x3E\x18\x49\x90\xB8\x41\x92\xC4\x3E"
"\x3B\xF9\x14\xB6\xDC\xF4\xFE\x3E\x53\xF8\xC0\x09\xF1\x83\xC4\x3E\x8B\xC8\x87"
"\xC9\x8F\x54\x03\x3F\xF2\x93\x02\x7B\x76\x72\xC4\x3E\x1B\x4C\x71\x20\xB6\x2B"
"\x07\x3F\xEA\xB1\x54\xBE\xD4\x5D\xC4\x3E\x84\x64\x87\xC4\x49\xFF\x0A\x3F\x64"
"\x79\x2F\x02\x0F\x46\xC4\x3E\xC1\x78\x97\xA7\xB3\xCE\x0E\x3F\x65\x0A\x06\xF1"
"\x28\x2B\xC4\x3E\x93\x8D\xE3\xAF\xAE\x4C\x11\x3F\x28\xC7\xB5\xB0\x26\x0D\xC4"
"\x3E\x9F\xE9\x62\x9F\x58\x2F\x13\x3F\x7B\x95\xE2\xE1\x0C\xEC\xC3\x3E\x1D\xFE"
"\x70\x34\x0D\x0F\x15\x3F\x5E\x31\x40\x9F\xE0\xC7\xC3\x3E\x75\x94\xFE\x75\x82"
"\xEB\x16\x3F\x12\xAD\xC8\x7C\xA7\xA0\xC3\x3E\x78\x41\x2E\xEB\x6E\xC4\x18\x3F"
"\xAD\x3D\xE0\x86\x67\x76\xC3\x3E\x58\xD0\xA8\xA6\x89\x99\x1A\x3F\x19\x76\x66"
"\x41\x27\x49\xC3\x3E\xA7\x29\xDC\x51\x8A\x6A\x1C\x3F\x6C\x15\xB5\xA6\xED\x18"
"\xC3\x3E\x8A\xFA\x22\x38\x29\x37\x1E\x3F\x1D\x90\x8C\x26\xC2\xE5\xC2\x3E\xA7"
"\x63\xD3\x51\x1F\xFF\x1F\x3F\xB5\x7E\xEE\xA4\xAC\xAF\xC2\x3E\xB1\x7D\x99\x27"
"\x13\xE1\x20\x3F\x29\x1F\xE6\x78\xB5\x76\xC2\x3E\x7C\x39\xA7\x51\xFC\xBF\x21"
"\x3F\xE9\x18\x3F\x6B\xE5\x3A\xC2\x3E\x3F\x99\x59\xC7\x28\x9C\x22\x3F\x76\xB6"
"\x2A\xB5\x45\xFC\xC1\x3E\x2B\xA2\x04\x95\x76\x75\x23\x3F\xE8\xC9\xD3\xFE\xDF"
"\xBA\xC1\x3E\xD1\xF6\x45\x38\xC4\x4B\x24\x3F\x96\x74\xE1\x5D\xBE\x76\xC1\x3E"
"\x2B\xAB\x2F\xA5\xF0\x1E\x25\x3F\xA0\x0C\xE9\x53\xEB\x2F\xC1\x3E\x78\xD4\x60"
"\x4B\xDB\xEE\x25\x3F\xC5\x5D\xCF\xCC\x71\xE6\xC0\x3E\xBF\x0B\x0B\x1B\x64\xBB"
"\x26\x3F\x79\x85\x19\x1D\x5D\x9A\xC0\x3E\xC1\x1C\xE4\x89\x6B\x84\x27\x3F\xB2"
"\xAC\x2D\x00\xB9\x4B\xC0\x3E\x27\x1E\x03\x98\xD2\x49\x28\x3F\xDC\xC8\x07\x2D"
"\x23\xF5\xBF\x3E\xEF\x32\xA8\xD4\x7A\x0B\x29\x3F\x9E\xD8\x8E\xC7\xE6\x4D\xBF"
"\x3E\x6D\x38\xEE\x62\x46\xC9\x29\x3F\x63\x5A\xD1\x99\xD6\xA1\xBE\x3E\x76\xA7"
"\x65\xFE\x17\x83\x2A\x3F\xB1\x67\x40\x2C\x0D\xF1\xBD\x3E\x11\xF3\x97\xFF\xD2"
"\x38\x2B\x3F\x7E\xF4\xC5\xC1\xA5\x3B\xBD\x3E\x62\xB2\x72\x60\x5B\xEA\x2B\x3F"
"\x85\xA0\x90\x53\xBC\x81\xBC\x3E\x76\xE7\x99\xC0\x95\x97\x2C\x3F\x57\x6D\xC3"
"\x8C\x6D\xC3\xBB\x3E\x4F\xB8\xA0\x69\x67\x40\x2D\x3F\x65\x03\x0A\xC6\xD6\x00"
"\xBB\x3E\x8B\xF3\x27\x53\xB6\xE4\x2D\x3F\x87\x34\x12\x01\x16\x3A\xBA\x3E\x09"
"\xBE\xE1\x26\x69\x84\x2E\x3F\xB3\x6E\xEB\xE3\x49\x6F\xB9\x3E\x19\xCB\x79\x44"
"\x67\x1F\x2F\x3F\x7B\xD5\x4C\xB4\x91\xA0\xB8\x3E\xE8\x84\x61\xC5\x98\xB5\x2F"
"\x3F\xF1\xBC\xC2\x52\x0D\xCE\xB7\x3E\xAC\xC7\x3F\x40\x73\x23\x30\x3F\x4C\x44"
"\xC4\x35\xDD\xF7\xB6\x3E\xD7\x09\xE1\x06\x9D\x69\x30\x3F\x5C\xD2\xB1\x64\x22"
"\x1E\xB6\x3E\x19\xA4\xC9\xE4\x3E\xAD\x30\x3F\x67\x39\xBD\x72\xFE\x40\xB5\x3E"
"\x9A\xD5\x17\x6C\x4E\xEE\x30\x3F\x86\x4B\xBC\x79\x93\x60\xB4\x3E\xF9\xB6\x6E"
"\x94\xC1\x2C\x31\x3F\xD8\xAB\xE6\x14\x04\x7D\xB3\x3E\x15\x48\x82\xBC\x8E\x68"
"\x31\x3F\x32\xAC\x7F\x5B\x73\x96\xB2\x3E\x34\x99\x93\xAB\xAC\xA1\x31\x3F\xED"
"\x09\x6D\xDB\x04\xAD\xB1\x3E\xD9\xD4\xDC\x92\x12\xD8\x31\x3F\x68\x5F\xBB\x93"
"\xDC\xC0\xB0\x3E\x25\xF2\xEC\x0E\xB8\x0B\x32\x3F\x80\x45\x22\xDE\x3D\xA4\xAF"
"\x3E\x2D\xDA\xF2\x28\x95\x3C\x32\x3F\x9B\x19\x22\x7C\xE1\xC1\xAD\x3E\x38\xCD"
"\xF7\x57\xA2\x6A\x32\x3F\x3D\x8C\x59\x63\xEE\xDA\xAB\x3E\x57\xD7\x08\x82\xD8"
"\x95\x32\x3F\x0B\x74\xD1\xAA\xAF\xEF\xA9\x3E\x8D\x26\x4F\xFD\x30\xBE\x32\x3F"
"\x56\xE4\x23\x13\x71\x00\xA8\x3E\x45\x17\x17\x91\xA5\xE3\x32\x3F\x48\xBE\xCD"
"\xFA\x7E\x0D\xA6\x3E\x7D\xCE\xC5\x76\x30\x06\x33\x3F\x51\xE9\x67\x52\x26\x17"
"\xA4\x3E\xD7\x3B\xBD\x5A\xCC\x25\x33\x3F\xB5\x03\xCA\x90\xB4\x1D\xA2\x3E\x57"
"\x60\x2E\x5D\x74\x42\x33\x3F\x8F\x5F\x18\xA7\x77\x21\xA0\x3E\x72\xB9\xD9\x12"
"\x24\x5C\x33\x3F\xCC\x47\x7E\xE9\x7B\x45\x9C\x3E\xAB\xB2\xBD\x85\xD7\x72\x33"
"\x3F\x85\xD8\xB8\x76\xAC\x43\x98\x3E\xF3\x02\xB3\x35\x8B\x86\x33\x3F\xC6\x87"
"\x36\x25\x1F\x3E\x94\x3E\x96\xDD\xF6\x18\x3C\x97\x33\x3F\xEA\xB7\x02\xB8\x72"
"\x35\x90\x3E\x78\xE1\xA2\x9C\xE7\xA4\x33\x3F\x3C\x79\xC5\xDA\x8C\x54\x88\x3E"
"\x1C\xB4\x12\xA5\x8B\xAF\x33\x3F\xA6\xFF\x8E\xCC\x73\x3A\x80\x3E\xC4\x38\x37"
"\x8E\x26\xB7\x33\x3F\x01\xBE\xBF\x36\xB4\x3B\x70\x3E\xE5\x56\xD7\x2B\xB7\xBB"
"\x33\x3F\x00\x00\x00\x00\x00\x00\x00\x00\xDE\x45\xBE\xC9\x3C\xBD\x33\x3F";
/*
 * one_m230: 16 bytes of constant data for sin().
 * The first 8 bytes are the little-endian IEEE-754 double 0x3FEFFFFFFF800000,
 * i.e. 1 - 2^-30; the second 8 bytes are zero. sin() multiplies very small
 * inputs by the first double.
 * Note that this is a pointer variable: assembly code must load the pointer
 * and dereference it to reach the data.
 */
char *one_m230 = "\x00\x00\x80\xff\xff\xff\xef\x3f"
"\x00\x00\x00\x00\x00\x00\x00\x00";
/*
 * pitable: argument-reduction table read by sin() for large |x|.
 *
 * Layout: 15 pairs of little-endian IEEE-754 doubles, 16 bytes per pair,
 * 240 bytes in total. sin() picks a pair from the exponent of x
 * (byte offset = r10*2, with r10 a multiple of 8), splits the first double
 * into its upper 40 bits and its lower 24 bits (the latter shifted up to form
 * another double), and uses the second double as a further low-order term.
 * The mantissas are overlapping windows of the binary expansion of 128/pi.
 *
 * The final byte (0x37) used to be missing, so the string's NUL terminator
 * completed the last double and turned 0x3715993C439041FE into
 * 0x0015993C439041FE. The exponent 0x371 follows from the position of this
 * window in the 128/pi expansion, consistent with the preceding entries.
 */
char *pitable = "\x39\x79\x3E\x6D\x30\x5F\x44\x40\x48\x5F\x9D\xF0\xA7\x54\x70"
"\x3D\x39\x79\x3E\x6D\x30\x5F\x44\x40\x48\x5F\x9D\xF0\xA7\x54\x70\x3D\x88\x0C"
"\x3E\x72\x1B\xCC\xE7\x3F\xF5\xD1\x57\x27\xFC\x29\x15\x3D\x54\x70\x3D\xE4\xE4"
"\x36\x58\x3F\x9A\xFA\xE8\xAB\x13\xFE\xA4\x3C\x29\x15\x3D\x20\x72\x72\xEB\x3E"
"\x9C\xA6\x3E\xFA\xEA\x84\x4F\x3C\xF8\x83\x3C\xA9\x20\x72\x62\x3E\xE0\x6E\x9A"
"\xFA\xE8\xAB\xA3\x3B\x27\x1C\x3C\x4F\xA9\x20\xE2\x3D\x37\x70\x37\x4D\x7D\xF4"
"\x35\x3B\xF4\x35\x3B\x27\xFC\x29\x15\x3D\x29\xB6\x0D\xDC\x4D\x53\x5F\x3A\x46"
"\xFF\x3A\x75\xC2\x9F\xD2\x3C\xA5\xD8\x36\x70\x37\x4D\x3D\x3A\x50\x5F\x3A\x7D"
"\x75\xC2\x5F\x3C\xCD\x4A\xB1\x6D\xE0\x6E\xAA\x39\x4D\xD3\x39\xFA\xE8\xAB\xA3"
"\x3B\x79\x32\x2B\xC5\xB6\x81\x0B\x39\xDD\x94\x39\xA9\x8F\xBE\x6A\x3B\x88\x27"
"\xB3\x52\x6C\x1B\xC8\x38\xC0\x1D\x39\xA6\xA9\x8F\xEE\x3A\x72\x88\x27\xB3\x52"
"\x6C\x4B\x38\x60\x4B\x38\xDC\x4D\x53\x5F\x3A\x10\xE4\x10\x4F\x66\xA5\xB8\x37"
"\xC5\x06\x38\x0D\xDC\x4D\xD3\x39\xFE\x41\x90\x43\x3C\x99\x15\x37";
/*
 * Single-precision sine computed with scalar SSE2/SSE3 double arithmetic.
 *
 * Parameters: x - argument as a float (read as radians by the 128/pi scaling).
 * Returns:    the sine of x as a float. Infinities and NaNs yield NaN (x - x).
 *
 * Depends on the file-level tables ctable, pitable and one_m230. All three
 * are `char *` variables, so the assembly first loads the pointer and then
 * addresses the data through a register.
 *
 * Requirements / caveats:
 *  - x86-64 only, and the compiler must accept MS-style `__asm { }` blocks
 *    in 64-bit mode.
 *  - The name collides with the C library's `double sin(double)` if <math.h>
 *    is in scope in the same translation unit.
 *  - Registers written: rax, rcx, rdx, r8-r11, xmm0-xmm5, flags.
 *  - 8 bytes of stack are reserved with `push` and released with `pop`:
 *    [rsp]   holds the caller's MXCSR,
 *    [rsp+4] is scratch for building a modified MXCSR.
 *  - If the caller's MXCSR rounding-control bits (mask 0x6000) are non-zero,
 *    they are cleared for the computation and OR-ed back in before the final
 *    double-to-float conversion.
 */
float sin(float x) {
float res;
__asm {
    movss xmm0, x
    push rsi                        // reserve 8 bytes of scratch; the value pushed is irrelevant
    stmxcsr [rsp]                   // [rsp] = caller's MXCSR
    mov eax, [rsp]
    and eax, 0xffff9fff             // clear rounding-control bits 13-14
    cmp eax, [rsp]
    jnz label_1804                  // non-default rounding: switch it off first
label_15d6:
    pshuflw xmm1,xmm0,0x44          // duplicate the low float into both low lanes
    movd ecx,xmm0                   // ecx = raw bits of x
    cvtps2pd xmm1,xmm1              // xmm1 = (double)x
    mov rax,0x40445f306e000000      // approx. 128/pi as a double
    movq xmm2,rax
    mov r10d,0x7fffffff
    mov r8d,0x49ffffff
    and r10d,ecx                    // r10d = bits of |x|
    sub r8d,r10d                    // negative when |x| bits > 0x49ffffff (|x| >= 2^21)
    sub r10d,0x39800000             // negative when |x| bits < 0x39800000 (|x| < 2^-12)
    or r8d,r10d
    jl label_170D                   // out of the main range: tiny, huge, Inf or NaN
    mov r11,0x4338000000000000      // 1.5 * 2^52: add/subtract rounds to an integer
    movq xmm4,r11
    mulsd xmm2,xmm1                 // xmm2 = x * (128/pi approx.)
    mov rdx,0xbe5b1bbead603d8b      // NOTE(review): presumably the low-order part of 128/pi -- confirm
    movq xmm3,rdx
    movapd xmm5,xmm2
    addsd xmm2,xmm4
    movd r9d,xmm2                   // r9d = low 32 bits of the sum = rounded integer n
    subsd xmm2,xmm4                 // xmm2 = n as a double
    mulsd xmm1,xmm3                 // low-order correction term
label_164a:
    // Shared tail. On entry: r9d = n, xmm5 - xmm2 = leading part of the reduced
    // argument, xmm1 = its low-order correction.
    mov r10,[ctable]                // r10 = address of the coefficient table
    mov eax,0x180
    mov rdx,0x40c37423899a1558      // NOTE(review): polynomial constant -- meaning not derivable here
    movq xmm4,rdx
    subsd xmm5,xmm2
    add r9d,r9d                     // 2n: low byte indexes the table, bits 7-8 select signs
    mov r8d,r9d
    sar r9b,0x7                     // r9b = 0 or -1 from the sign of the low byte
    and eax,r8d                     // eax = bits 7-8 of 2n
    add r8b,r9b
    xor r8b,r9b                     // r8b = |low byte| (two's-complement abs), range 0..128
    addsd xmm1,xmm5                 // xmm1 = reduced argument r
    and r8d,0xfe                    // even index; also clears bits above the low byte
    movsd xmm3,[r10+r8*8]           // first double of the table pair
    mov r11,0x40a9f02f6222c720      // NOTE(review): polynomial constant -- meaning not derivable here
    movq xmm0,r11
    movddup xmm2,xmm1
    mulsd xmm1,xmm1                 // xmm1 = r*r
    movsd xmm5,[r10+r8*8+0x8]       // second double of the table pair
    mov r9d,eax
    add r9d,0x80
    and r9d,0x100
    shl r9,0x37                     // bit 8 -> bit 63: sign mask for the first term
    mulsd xmm3,xmm2
    movq xmm2,r9
    subsd xmm0,xmm1
    and eax,0x100
    shl rax,0x37                    // bit 8 -> bit 63: sign mask for the second term
    subsd xmm4,xmm1
    movq xmm1,rax
    xorpd xmm4,xmm2
    mulsd xmm0,xmm5
    mulsd xmm3,xmm4
    xorpd xmm0,xmm1
    addsd xmm0,xmm3                 // xmm0 = result in double precision
    mov eax,[rsp]
    and eax,0xffff9fff
    cmp eax,[rsp]
    jnz label_1812                  // restore caller's rounding mode before narrowing
    cvtpd2ps xmm0,xmm0
    jmp label_182a
label_170D:
    and ecx,0x7fffffff
    cmp ecx,0x3e000000              // |x| bits vs. 0.125: here that separates tiny from huge
    ja label_1741
    // Tiny |x|: result is x * (1 - 2^-30).
    cvtss2sd xmm0,xmm0
    mov rax,[one_m230]              // one_m230 is a pointer: load it, then dereference.
    mulsd xmm0,[rax]                // (Using [one_m230] directly would multiply by the pointer's bits.)
    mov eax,[rsp]
    and eax,0xffff9fff
    cmp eax,[rsp]
    jnz label_1812
label_17fe:
    cvtsd2ss xmm0,xmm0
    jmp label_182a
label_1741:
    movd eax,xmm0
    and eax,0x7f800000
    cmp eax,0x7f800000
    jz label_17d5                   // exponent all ones: Inf or NaN
    // Huge finite |x|: pick a pitable pair from the exponent.
    // r10d still holds (|x| bits - 0x39800000) from the range check.
    shr r10d,0x17
    sub r10d,0x1b
    and r10d,0xfff8
    mov rax, [pitable]              // rax = address of the reduction table
    movsd xmm3,[rax+r10*2]
    movsd xmm5,[rax+r10*2+0x8]
    mov r9,0xffffffffff000000       // keep the upper 40 bits of the first table double
    movq xmm2,r9
    mov r11,0x4338000000000000      // 1.5 * 2^52 rounding constant
    movq xmm4,r11
    andpd xmm2,xmm3                 // high piece
    psllq xmm3,0x28                 // low 24 bits shifted up form the middle piece
    mulsd xmm2,xmm1
    mulsd xmm3,xmm1
    mulsd xmm1,xmm5                 // low piece
    movapd xmm0,xmm2
    addsd xmm2,xmm3
    movapd xmm5,xmm2
    subsd xmm0,xmm2
    addsd xmm2,xmm4
    addsd xmm3,xmm0                 // rounding error of the high+middle sum
    movd r9d,xmm2                   // r9d = rounded integer n
    subsd xmm2,xmm4
    addsd xmm1,xmm3
    jmp label_164a
label_17d5:
    subss xmm0,xmm0                 // Inf - Inf or NaN - NaN: produces NaN
    mov eax,[rsp]
    and eax,0xffff9fff
    cmp eax,[rsp]
    jz label_17fc
    stmxcsr [rsp+0x4]               // restore the caller's rounding-control bits
    mov eax,[rsp]
    and eax,0x6000
    or [rsp+0x4],eax
    ldmxcsr [rsp+0x4]
label_17fc:
    jmp label_182a
label_1804:
    // Entry fix-up: eax holds the caller's MXCSR with rounding control cleared.
    mov [rsp+0x4],eax
    ldmxcsr [rsp+0x4]
    jmp label_15d6
label_1812:
    // Exit fix-up: OR the caller's rounding-control bits back into MXCSR,
    // then narrow the double result to float.
    stmxcsr [rsp+0x4]
    mov eax,[rsp]
    and eax,0x6000
    or [rsp+0x4],eax
    ldmxcsr [rsp+0x4]
    jmp label_17fe
label_182a:
    pop rcx                         // release the 8 bytes of scratch
    movss res, xmm0
}
return res;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment