Created
March 8, 2018 10:44
-
-
Save madmann91/adedc899db0897e19bad6c0aa722b4da to your computer and use it in GitHub Desktop.
Fast division using precomputed multiplicative inverse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <time.h> | |
#define PAGE_SIZE 4096 | |
typedef struct asmbuf_s asmbuf_t; | |
struct asmbuf_s { | |
uint8_t code[PAGE_SIZE - sizeof(uint64_t)]; | |
uint64_t count; | |
}; | |
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) | |
#include <sys/mman.h> | |
asmbuf_t* asmbuf_create(void) { | |
return mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); | |
} | |
void asmbuf_finalize(asmbuf_t* buf) { | |
mprotect(buf, sizeof(asmbuf_t), PROT_READ | PROT_EXEC); | |
} | |
void asmbuf_free(asmbuf_t* buf) { | |
munmap(buf, PAGE_SIZE); | |
} | |
#elif defined(WIN32) | |
#include <windows.h> | |
asmbuf_t* asmbuf_create(void) { | |
return VirtualAlloc(NULL, PAGE_SIZE, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); | |
} | |
void asmbuf_finalize(asmbuf_t* buf) { | |
DWORD old; | |
VirtualProtect(buf, sizeof(*buf), PAGE_EXECUTE_READ, &old); | |
} | |
void asmbuf_free(asmbuf_t* buf) { | |
VirtualFree(buf, 0, MEM_RELEASE); | |
} | |
#else | |
#error "Cannot allocate asm buffer" | |
#endif | |
typedef struct fastdiv_s fastdiv_t; | |
struct fastdiv_s { | |
uint32_t m, s1, s2; | |
}; | |
fastdiv_t fastdiv_init(uint32_t div) { | |
uint32_t l = 1; | |
while ((1 << l) < div) ++l; | |
uint64_t max = 1ull << 32ull; | |
uint32_t m = (max << l) / div - max + 1; | |
uint32_t s1 = l < 1 ? l : 1; | |
uint32_t s2 = l > 1 ? l - 1 : 0; | |
return (fastdiv_t) { .m = m, .s1 = s1, .s2 = s2 }; | |
} | |
uint32_t fastdiv(fastdiv_t fd, uint32_t i) { | |
uint32_t t = ((uint64_t)fd.m * (uint64_t)i) >> 32ull; | |
return (t + ((i - t) >> fd.s1)) >> fd.s2; | |
} | |
asmbuf_t* asmbuf_gen_fastdiv(uint32_t div) { | |
asmbuf_t* buf = asmbuf_create(); | |
// power of 2 | |
if ((div & (div - 1)) == 0) { | |
uint32_t s = 0; | |
while ((div & 1) == 0) s++, div >>= 1; | |
// mov %edi, %eax | |
buf->code[buf->count++] = 0x89; | |
buf->code[buf->count++] = 0xf8; | |
// shr imm8, %eax | |
buf->code[buf->count++] = 0xc1; | |
buf->code[buf->count++] = 0xe8; | |
buf->code[buf->count++] = s; | |
// retq | |
buf->code[buf->count++] = 0xc3; | |
} else { | |
fastdiv_t fd = fastdiv_init(div); | |
// mov %edi, %eax | |
buf->code[buf->count++] = 0x89; | |
buf->code[buf->count++] = 0xf8; | |
// mov imm32, %edx | |
buf->code[buf->count++] = 0xba; | |
buf->code[buf->count++] = fd.m; | |
buf->code[buf->count++] = fd.m >> 8; | |
buf->code[buf->count++] = fd.m >> 16; | |
buf->code[buf->count++] = fd.m >> 24; | |
// mul %edx | |
buf->code[buf->count++] = 0xf7; | |
buf->code[buf->count++] = 0xe2; | |
// mov %edi, %eax | |
buf->code[buf->count++] = 0x89; | |
buf->code[buf->count++] = 0xf8; | |
// sub %edx, %eax | |
buf->code[buf->count++] = 0x29; | |
buf->code[buf->count++] = 0xd0; | |
if (fd.s1 > 0) { | |
// shr imm8, %eax | |
buf->code[buf->count++] = 0xc1; | |
buf->code[buf->count++] = 0xe8; | |
buf->code[buf->count++] = fd.s1; | |
} | |
// add %edx, %eax | |
buf->code[buf->count++] = 0x01; | |
buf->code[buf->count++] = 0xd0; | |
// shr imm8, %eax | |
if (fd.s2 > 0) { | |
// shr imm8, %eax | |
buf->code[buf->count++] = 0xc1; | |
buf->code[buf->count++] = 0xe8; | |
buf->code[buf->count++] = fd.s2; | |
} | |
// retq | |
buf->code[buf->count++] = 0xc3; | |
} | |
asmbuf_finalize(buf); | |
return buf; | |
} | |
int main(int argc, char** argv) { | |
if (argc < 2) | |
return 1; | |
uint32_t n = strtol(argv[1], NULL, 10); | |
clock_t start, end; | |
uint32_t sum; | |
asmbuf_t* buf = asmbuf_gen_fastdiv(n); | |
uint32_t (*div)(uint32_t) = (void*)buf->code; | |
start = clock(); | |
sum = 0; | |
for (uint32_t i = 0; i != 0xFFFFFFFF; ++i) | |
sum += div(i); | |
end = clock(); | |
asmbuf_free(buf); | |
printf("jit: %d - %d ms\n", sum, (end - start) * 1000 / CLOCKS_PER_SEC); | |
fastdiv_t fd = fastdiv_init(n); | |
start = clock(); | |
sum = 0; | |
for (uint32_t i = 0; i != 0xFFFFFFFF; ++i) | |
sum += fastdiv(fd, i); | |
end = clock(); | |
printf("fastdiv: %d - %d ms\n", sum, (end - start) * 1000 / CLOCKS_PER_SEC); | |
start = clock(); | |
sum = 0; | |
for (uint32_t i = 0; i != 0xFFFFFFFF; ++i) | |
sum += i / n; | |
end = clock(); | |
printf("ref: %d - %d ms\n", sum, (end - start) * 1000 / CLOCKS_PER_SEC); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment