-
-
Save orlp/3551590 to your computer and use it in GitHub Desktop.
int64_t ipow(int64_t base, uint8_t exp) { | |
static const uint8_t highest_bit_set[] = { | |
0, 1, 2, 2, 3, 3, 3, 3, | |
4, 4, 4, 4, 4, 4, 4, 4, | |
5, 5, 5, 5, 5, 5, 5, 5, | |
5, 5, 5, 5, 5, 5, 5, 5, | |
6, 6, 6, 6, 6, 6, 6, 6, | |
6, 6, 6, 6, 6, 6, 6, 6, | |
6, 6, 6, 6, 6, 6, 6, 6, | |
6, 6, 6, 6, 6, 6, 6, 255, // anything past 63 is a guaranteed overflow with base > 1 | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
255, 255, 255, 255, 255, 255, 255, 255, | |
}; | |
int64_t result = 1; | |
switch (highest_bit_set[exp]) { | |
case 255: // we use 255 as an overflow marker and return 0 on overflow/underflow | |
if (base == 1) { | |
return 1; | |
} | |
if (base == -1) { | |
return 1 - 2 * (exp & 1); | |
} | |
return 0; | |
case 6: | |
if (exp & 1) result *= base; | |
exp >>= 1; | |
base *= base; | |
case 5: | |
if (exp & 1) result *= base; | |
exp >>= 1; | |
base *= base; | |
case 4: | |
if (exp & 1) result *= base; | |
exp >>= 1; | |
base *= base; | |
case 3: | |
if (exp & 1) result *= base; | |
exp >>= 1; | |
base *= base; | |
case 2: | |
if (exp & 1) result *= base; | |
exp >>= 1; | |
base *= base; | |
case 1: | |
if (exp & 1) result *= base; | |
default: | |
return result; | |
} | |
} |
Use switch fallthroughs.
That is some dang clever loop unrolling.
Wouldn't renaming 255 to 7 make this code easier to turn into an array lookup for the compiler ?
@ian-abbott Have you tested performance for static and non-static "highest_bit_set version"?
@Zorgatone yes you need to add, It will be equal to 7. And you need to add switch-case 7: . + Change signed parameters/values to unsigned. Or if you are sure the results you are aiming are <= int_64_max then of course you can use this function straight away ( in that case casting rules of C applies )
@MOJNICK If highest_bit_set
is non-static, the code initializes the array contents every time the function is called, so it is slower than the static version.
the lookup table for highest bit set can be replaced with __builtin_clz
if it is available (gcc, clang). It will make it faster (especially if the lookup table is not in cache, which is likely in a real-world application doing other things than that) and smaller.
Simply replace highest_bit_set[exp]
with (exp>0) ? 32 - __builtin_clz(exp) : 0
your code should end the cases sooner - when exp
is down to { 0 , 1 , 2 }
Cuz from there it's very obvious what needs to be returned :::
return { <= 0 := result
1 := result * base
2 := result * base * base
}
or more succinctly ::
return ( base ** exp ) * result
Because when exp == 2
, you already know ahead of time that ( exp >>= 1 ) & 1
must be true, so why bother evaluating the obvious ?
Another speed up trick you can contemplate is that if you already know up-front that the exponent is 1 short of a power of 2, then you already know every single bit is a 1.
In that case, make it into a vanilla countdown loop for # of bits, and simply doing the exact same thing every round without bothering to either right shift or check for ( exp & 1 )
. In my own code, for this scenario, I usually make the starting point slightly lagged, so each round I simply do a compound statement of
result *= base *= base # intentionally lagged, so one must be careful regarding when you
# need to perform the extra one to sync them back up with each other
The optimization here is at a whole new level. Thanks for sharing and well done sir!