Created
April 3, 2019 19:14
-
-
Save rygorous/422f8d8382eb61a55c087714c92f0d4e to your computer and use it in GitHub Desktop.
Binary search variants
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Baseline version without prefetch | |
static const LRMEntry * lrm_search_one_basic(const LRM * lrm, const U8 * ptr) | |
{ | |
LRM_hash_t hash = lrm_hash8(ptr); | |
// Jump-in: narrow down the search interval using the jump table | |
LRM_hash_t ji = hash >> lrm->jumpInShift; | |
S32 jump1 = lrm->jumpIn[ji]; | |
S32 jump2 = lrm->jumpIn[ji + 1]; | |
// Run the binary search, using the code from the LRM searcher | |
const LRMEntry * it = lrm->entries.data() + jump1; | |
size_t nleft = jump2 - jump1; | |
if (nleft > 0) | |
{ | |
while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft) | |
{ | |
const LRMEntry * mid = &it[half]; | |
// This reduction guarantees the two invariants above. It doesn't shrink nleft quite | |
// as quickly as it could but it has the advantage of being a very simple update. | |
// | |
// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract | |
// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the | |
// update rule slightly cheaper, even though it's a bit sloppy. | |
it = (mid->hash < hash) ? mid : it; // conditional move | |
nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1 | |
} | |
// The above loop shrank nleft down to 1. Do the final step. | |
RR_ASSERT(nleft == 1); | |
it += (it->hash < hash); | |
} | |
return it; | |
} | |
// Baseline version with prefetch | |
static const LRMEntry * lrm_search_one_prefetch(const LRM * lrm, const U8 * ptr) | |
{ | |
LRM_hash_t hash = lrm_hash8(ptr); | |
// Jump-in: narrow down the search interval using the jump table | |
LRM_hash_t ji = hash >> lrm->jumpInShift; | |
S32 jump1 = lrm->jumpIn[ji]; | |
S32 jump2 = lrm->jumpIn[ji + 1]; | |
// Run the binary search, using the code from the LRM searcher | |
const LRMEntry * it = lrm->entries.data() + jump1; | |
size_t nleft = jump2 - jump1; | |
if (nleft > 0) | |
{ | |
while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft) | |
{ | |
const LRMEntry * mid = &it[half]; | |
// Prefetch both possible paths for the next step | |
RR_PREFETCHR_CL(&it[half >> 1]); | |
RR_PREFETCHR_CL(&mid[half >> 1]); | |
// This reduction guarantees the two invariants above. It doesn't shrink nleft quite | |
// as quickly as it could but it has the advantage of being a very simple update. | |
// | |
// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract | |
// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the | |
// update rule slightly cheaper, even though it's a bit sloppy. | |
it = (mid->hash < hash) ? mid : it; // conditional move | |
nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1 | |
} | |
// The above loop shrank nleft down to 1. Do the final step. | |
RR_ASSERT(nleft == 1); | |
it += (it->hash < hash); | |
} | |
return it; | |
} | |
// Memory-pipelined: look up in large batches, getting as many independent | |
// cache misses into the pipe as possible before actually doing any of the | |
// loads. | |
template<int N> | |
static void lrm_search_multi_pf2(const LRM * lrm, const U8 * ptr, LRMEntry const ** it) | |
{ | |
const LRMEntry * entries = lrm->entries.data(); | |
LRM_hash_t hash[N]; | |
size_t nleft[N]; | |
size_t nleft_max = 0; | |
for (int i = 0; i < N; i++) | |
{ | |
hash[i] = lrm_hash8(ptr + i); | |
LRM_hash_t ji = hash[i] >> lrm->jumpInShift; | |
S32 jump1 = lrm->jumpIn[ji]; | |
S32 jump2 = lrm->jumpIn[ji + 1]; | |
it[i] = entries + jump1; | |
nleft[i] = jump2 - jump1; | |
nleft_max = RR_MAX(nleft_max, nleft[i]); | |
RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]); | |
} | |
// Run binary search iterations while at least one search has a | |
// non-trivial range left | |
while (nleft_max > 1) | |
{ | |
for (int i = 0; i < N; i++) | |
{ | |
size_t half = nleft[i] >> 1; | |
const LRMEntry * cur = it[i]; | |
const LRMEntry * mid = cur + half; | |
it[i] = (mid->hash < hash[i]) ? mid : cur; // conditional move | |
nleft[i] -= half; | |
RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]); | |
} | |
nleft_max -= nleft_max >> 1; | |
} | |
// Finalize the searches | |
for (int i = 0; i < N; i++) | |
{ | |
// Final step | |
RR_ASSERT(nleft[i] <= 1); | |
it[i] += (it[i]->hash < hash[i]); | |
} | |
} | |
// Results: | |
// SimpleProf :seconds calls count : clk/call clk/count min | |
// search_one : 1.9221 1 8388480 : 5581904066.0 665.42 665.42 <- baseline without PF | |
// search_one_pf : 1.4311 1 8388480 : 4155978905.0 495.44 495.44 <- baseline with PF | |
// search_multi8_pf2 : 0.8201 1 8388480 : 2381551257.0 283.91 283.91 | |
// search_multi16_pf2 : 0.6135 1 8388480 : 1781563475.0 212.38 212.38 | |
// search_multi32_pf2 : 0.5709 1 8388480 : 1657999650.0 197.65 197.65 | |
// search_multi64_pf2 : 0.5591 1 8388480 : 1623743724.0 193.57 193.57 | |
// search_multi128_pf2: 0.5739 1 8388480 : 1666734087.0 198.69 198.69 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment