Skip to content

Instantly share code, notes, and snippets.

@kusma
Last active August 30, 2015 11:54
Show Gist options
  • Save kusma/134730 to your computer and use it in GitHub Desktop.
Save kusma/134730 to your computer and use it in GitHub Desktop.
#ifdef USE_ASM
#ifdef ARM
// Count leading zero bits of a 32-bit value with ARM-state inline assembly
// and a 256-entry byte table.
//
// The value is narrowed to a single byte in two steps: if it is below 65536
// the running count grows by 16, otherwise the value is shifted right by 16;
// the same is then done with 256 / 8.  The table entry for the remaining
// byte is added to the running count.
//
// num: value to examine.
// Returns the table entry for the narrowed byte plus the accumulated count
// (a leading-zero count, assuming math::clz8_lut holds per-byte leading-zero
// counts -- the table is defined outside this function).
inline __attribute__((const))
unsigned int
clz_iasm(unsigned int num)
{
    unsigned int ret, tmp;
    const unsigned char *lut = &math::clz8_lut[0];
    asm (
        "mov %[tmp], #0 \n"
        "cmp %[num], #65536 \n"
        "movcs %[num], %[num], lsr #16 \n"   // num >= 65536: keep the upper half
        "addcc %[tmp], %[tmp], #16 \n"       // num <  65536: 16 more leading zeros
        "cmp %[num], #256 \n"
        "movcs %[num], %[num], lsr #8 \n"
        "addcc %[tmp], %[tmp], #8 \n"
        "ldrb %[ret], [%[lut], %[num]] \n"
        "add %[ret], %[ret], %[tmp] \n"
        : [ret] "=&r"(ret),   // tied to num below, so num may be modified in place
          [tmp] "=&r"(tmp)    // written before lut is read, hence early-clobber
        : [num] "0"(num),
          [lut] "r"(lut)      // base register of the ldrb; a memory operand would not assemble
        : "cc"                // cmp updates the condition flags
    );
    return ret;
}
#else // !defined(ARM)
// Count leading zero bits of a 32-bit value with inline assembly that uses
// only low registers and explicit branches (presumably the Thumb build, as
// this is the !defined(ARM) branch).
//
// The running count starts at 24 and is reduced by 16 and/or 8 whenever the
// value is large enough to be shifted down by that amount; the clz8_lut entry
// for the remaining byte is then added.
//
// note: a quick test reveals that this is actually SLOWER than the C-version!
//
// NOTE(review): `volatile` looks unnecessary since every result is an output
// operand -- confirm before removing.
//
// num: value to examine.
// Returns the clz8_lut entry for the narrowed byte plus the running count.
inline __attribute__((const))
unsigned int
clz(unsigned int num)
{
    unsigned int ret, tmp, mag;
    const unsigned char *lut = &clz8_lut[0];
    asm volatile (
        "mov %[mag], #1 \n"
        "lsl %[mag], #16 \n"             // mag = 65536, built in two steps
        "mov %[tmp], #24 \n"
        "cmp %[num], %[mag] \n"
        "bcc 1f \n"                      // num < 65536: nothing to shift
        "lsr %[num], %[num], #16 \n"
        "sub %[tmp], %[tmp], #16 \n"
        "1: \n"
        "lsr %[mag], %[mag], #8 \n"      // mag = 256
        "cmp %[num], %[mag] \n"
        "bcc 2f \n"
        "lsr %[num], %[num], #8 \n"
        "sub %[tmp], %[tmp], #8 \n"
        "2: \n"
        "ldrb %[ret], [%[lut], %[num]] \n"
        "add %[ret], %[ret], %[tmp] \n"
        // tmp and mag are written long before lut is read, so they must be
        // early-clobber or the compiler may place them in lut's register.
        : [ret] "=&l"(ret),   // tied to num; low register like the other operands
          [tmp] "=&l"(tmp),
          [mag] "=&l"(mag)
        : [num] "0"(num),
          [lut] "l"(lut)
        : "cc"
    );
    return ret;
}
#endif // !defined(ARM)
#else // !defined(USE_ASM)
// Portable leading-zero count: narrow the value to one byte, counting 16 and
// then 8 skipped zero bits along the way, and finish with a byte-table lookup.
//
// NOTE(review): `clz_lut` is not defined in the visible source; the 256-entry
// table in this file is named `clz8_lut` -- confirm which name is intended.
//
// num: value to examine.
// Returns the table entry for the narrowed byte plus the skipped bit count.
inline __attribute__((const))
unsigned int
clz(unsigned int num)
{
    unsigned int skipped;

    // Upper half empty -> 16 leading zeros so far; otherwise look at the upper half.
    if (num >> 16) {
        num >>= 16;
        skipped = 0;
    } else {
        skipped = 16;
    }

    // Same again for the upper byte of what is left.
    if (num >> 8)
        num >>= 8;
    else
        skipped += 8;

    return clz_lut[num] + skipped;
}
#endif // !defined(USE_ASM)
// Per-byte leading-zero counts: clz8_lut[v] is the number of zero bits above
// the highest set bit of the 8-bit value v, and 8 for v == 0.
// Entries are grouped by the value range that shares a count.
const
unsigned char
clz8_lut[256] = {
    /*   0      */ 8,
    /*   1      */ 7,
    /*   2..3   */ 6,6,
    /*   4..7   */ 5,5,5,5,
    /*   8..15  */ 4,4,4,4,4,4,4,4,
    /*  16..31  */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /*  32..63  */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
                   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /*  64..127 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
                   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
                   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
                   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 128..255 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
/* Per-nibble leading-zero counts: clz4_lut[v] is the number of zero bits
 * above the highest set bit of the 4-bit value v, and 4 for v == 0.
 * Timing note: about the same speed as the C-code + 256b ROM LUT, if
 * clz4_lut is in IWRAM. */
const unsigned char clz4_lut[16] = {
    4,                      /* 0     */
    3,                      /* 1     */
    2, 2,                   /* 2..3  */
    1, 1, 1, 1,             /* 4..7  */
    0, 0, 0, 0, 0, 0, 0, 0  /* 8..15 */
};
// Count leading zero bits of a 32-bit value with ARM-state inline assembly
// and the 16-entry clz4_lut table.
//
// The value is narrowed to a single nibble in three steps (65536 / 16 bits,
// 256 / 8 bits, 16 / 4 bits): when the value is below the threshold the
// running count grows by the step width, otherwise the value is shifted right
// by that width.  The table entry for the remaining nibble is added last.
//
// num: value to examine.
// Returns the clz4_lut entry for the narrowed nibble plus the running count.
inline __attribute__((const))
unsigned int
clz_iasm(unsigned int num)
{
    unsigned int ret, tmp;
    const unsigned char *lut = &clz4_lut[0];
    asm (
        "mov %[tmp], #0 \n"
        "cmp %[num], #65536 \n"
        "movcs %[num], %[num], lsr #16 \n"   // num >= 65536: keep the upper half
        "addcc %[tmp], %[tmp], #16 \n"       // num <  65536: 16 more leading zeros
        "cmp %[num], #256 \n"
        "movcs %[num], %[num], lsr #8 \n"
        "addcc %[tmp], %[tmp], #8 \n"
        "cmp %[num], #16 \n"
        "movcs %[num], %[num], lsr #4 \n"
        "addcc %[tmp], %[tmp], #4 \n"
        "ldrb %[ret], [%[lut], %[num]] \n"
        "add %[ret], %[ret], %[tmp] \n"
        : [ret] "=&r"(ret),   // tied to num below, so num may be modified in place
          [tmp] "=&r"(tmp)    // written before lut is read, hence early-clobber
        : [num] "0"(num),
          [lut] "r"(lut)      // base register of the ldrb; a memory operand would not assemble
        : "cc"                // cmp updates the condition flags
    );
    return ret;
}
// Times math::clz32 against clz_iasm and then cross-checks their results.
//
// Each timing pass resets `blanks`, runs 2M dependent calls (the result is fed
// back in as the next argument) and prints `blanks` afterwards.
// NOTE(review): `blanks` is declared outside this function; presumably it is
// advanced elsewhere (e.g. once per vertical blank) -- confirm.
//
// The check pass compares both functions on (1 << i) - 1 + j for i in [0, 31)
// and j in [0, 3), i.e. the values around each power of two.  On the first
// mismatch the offending value and both results are printed and the function
// never returns.
//
// Returns the sum of the two timing accumulators, which also keeps the
// accumulators in use after the loops.
int
test_clz()
{
    int i, j;
    int dummy1 = 0;
    int dummy2 = 0;

    blanks = 0;
    for (i = 0; i < 1024 * 1024 * 2; ++i)
        dummy1 += math::clz32(dummy1);
    iprintf("math::clz32: %d blanks\n", blanks);

    blanks = 0;
    for (i = 0; i < 1024 * 1024 * 2; ++i)
        dummy2 += clz_iasm(dummy2);
    iprintf("clz_iasm: %d blanks\n", blanks);

    iprintf("%d, %d\n", dummy1, dummy2);

    for (i = 0; i < 31; ++i) {
        for (j = 0; j < 3; ++j) {
            const int value = (1 << i) - 1 + j;
            int correct = math::clz32(value);
            int test = clz_iasm(value);
            if (correct != test) {
                iprintf("* %d (%d, %d)\n", value, correct, test);
                // Halt on failure.  The wait call must be the loop body: with
                // a bare `while (1);` it is unreachable and the loop has no
                // side effects.
                while (1)
                    VBlankIntrWait();
            }
        }
    }
    iprintf("done\n");
    return dummy1 + dummy2;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment