rygorous · April 3, 2019 19:14
diff --git a/gistfile1.txt b/gistfile1.txt
 // Baseline version without prefetch
 static const LRMEntry * lrm_search_one_basic(const LRM * lrm, const U8 * ptr)
 {
 	LRM_hash_t hash = lrm_hash8(ptr);

 	// Jump-in: narrow down the search interval using the jump table
 	LRM_hash_t ji = hash >> lrm->jumpInShift;
 	S32 jump1 = lrm->jumpIn[ji];
 	S32 jump2 = lrm->jumpIn[ji + 1];

 	// Run the binary search, using the code from the LRM searcher
 	const LRMEntry * it = lrm->entries.data() + jump1;
 	size_t nleft = jump2 - jump1;

 	if (nleft > 0)
 	{
 		while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
 		{
 			const LRMEntry * mid = &it[half];

 			// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
 			// as quickly as it could but it has the advantage of being a very simple update.
 			//
 			// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
 			// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
 			// update rule slightly cheaper, even though it's a bit sloppy.
 			it = (mid->hash < hash) ? mid : it; // conditional move
 			nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
 		}

 		// The above loop shrank nleft down to 1. Do the final step.
 		RR_ASSERT(nleft == 1);
 		it += (it->hash < hash);
 	}

 	return it;
 }

 // Baseline version with prefetch
 static const LRMEntry * lrm_search_one_prefetch(const LRM * lrm, const U8 * ptr)
 {
 	LRM_hash_t hash = lrm_hash8(ptr);

 	// Jump-in: narrow down the search interval using the jump table
 	LRM_hash_t ji = hash >> lrm->jumpInShift;
 	S32 jump1 = lrm->jumpIn[ji];
 	S32 jump2 = lrm->jumpIn[ji + 1];

 	// Run the binary search, using the code from the LRM searcher
 	const LRMEntry * it = lrm->entries.data() + jump1;
 	size_t nleft = jump2 - jump1;

 	if (nleft > 0)
 	{
 		while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
 		{
 			const LRMEntry * mid = &it[half];

 			// Prefetch both possible paths for the next step
 			RR_PREFETCHR_CL(&it[half >> 1]);
 			RR_PREFETCHR_CL(&mid[half >> 1]);

 			// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
 			// as quickly as it could but it has the advantage of being a very simple update.
 			//
 			// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
 			// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
 			// update rule slightly cheaper, even though it's a bit sloppy.
 			it = (mid->hash < hash) ? mid : it; // conditional move
 			nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
 		}

 		// The above loop shrank nleft down to 1. Do the final step.
 		RR_ASSERT(nleft == 1);
 		it += (it->hash < hash);
 	}

 	return it;
 }

 // Memory-pipelined: look up in large batches, getting as many independent
 // cache misses into the pipe as possible before actually doing any of the
 // loads.
 template<int N>
 static void lrm_search_multi_pf2(const LRM * lrm, const U8 * ptr, LRMEntry const ** it)
 {
 	const LRMEntry * entries = lrm->entries.data();

 	LRM_hash_t hash[N];
 	size_t nleft[N];
 	size_t nleft_max = 0;

 	for (int i = 0; i < N; i++)
 	{
 		hash[i] = lrm_hash8(ptr + i);
 		LRM_hash_t ji = hash[i] >> lrm->jumpInShift;

 		S32 jump1 = lrm->jumpIn[ji];
 		S32 jump2 = lrm->jumpIn[ji + 1];

 		it[i] = entries + jump1;
 		nleft[i] = jump2 - jump1;
 		nleft_max = RR_MAX(nleft_max, nleft[i]);

 		RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
 	}

 	// Run binary search iterations while at least one search has a
 	// non-trivial range left
 	while (nleft_max > 1)
 	{
 		for (int i = 0; i < N; i++)
 		{
 			size_t half = nleft[i] >> 1;
 			const LRMEntry * cur = it[i];
 			const LRMEntry * mid = cur + half;

 			it[i] = (mid->hash < hash[i]) ? mid : cur; // conditional move
 			nleft[i] -= half;

 			RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
 		}

 		nleft_max -= nleft_max >> 1;
 	}

 	// Finalize the searches
 	for (int i = 0; i < N; i++)
 	{
 		// Final step
 		RR_ASSERT(nleft[i] <= 1);
 		it[i] += (it[i]->hash < hash[i]);
 	}
 }

 // Results:
 // SimpleProf         :seconds  calls     count :     clk/call    clk/count      min
 // search_one         : 1.9221      1   8388480 : 5581904066.0       665.42   665.42 <- baseline without PF
 // search_one_pf      : 1.4311      1   8388480 : 4155978905.0       495.44   495.44 <- baseline with PF
 // search_multi8_pf2  : 0.8201      1   8388480 : 2381551257.0       283.91   283.91
 // search_multi16_pf2 : 0.6135      1   8388480 : 1781563475.0       212.38   212.38
 // search_multi32_pf2 : 0.5709      1   8388480 : 1657999650.0       197.65   197.65
 // search_multi64_pf2 : 0.5591      1   8388480 : 1623743724.0       193.57   193.57
 // search_multi128_pf2: 0.5739      1   8388480 : 1666734087.0       198.69   198.69
	// Baseline version without prefetch
	static const LRMEntry * lrm_search_one_basic(const LRM * lrm, const U8 * ptr)
	{
	LRM_hash_t hash = lrm_hash8(ptr);

	// Jump-in: narrow down the search interval using the jump table
	LRM_hash_t ji = hash >> lrm->jumpInShift;
	S32 jump1 = lrm->jumpIn[ji];
	S32 jump2 = lrm->jumpIn[ji + 1];

	// Run the binary search, using the code from the LRM searcher
	const LRMEntry * it = lrm->entries.data() + jump1;
	size_t nleft = jump2 - jump1;

	if (nleft > 0)
	{
	while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
	{
	const LRMEntry * mid = &it[half];

	// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
	// as quickly as it could but it has the advantage of being a very simple update.
	//
	// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
	// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
	// update rule slightly cheaper, even though it's a bit sloppy.
	it = (mid->hash < hash) ? mid : it; // conditional move
	nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
	}

	// The above loop shrank nleft down to 1. Do the final step.
	RR_ASSERT(nleft == 1);
	it += (it->hash < hash);
	}

	return it;
	}

	// Baseline version with prefetch
	static const LRMEntry * lrm_search_one_prefetch(const LRM * lrm, const U8 * ptr)
	{
	LRM_hash_t hash = lrm_hash8(ptr);

	// Jump-in: narrow down the search interval using the jump table
	LRM_hash_t ji = hash >> lrm->jumpInShift;
	S32 jump1 = lrm->jumpIn[ji];
	S32 jump2 = lrm->jumpIn[ji + 1];

	// Run the binary search, using the code from the LRM searcher
	const LRMEntry * it = lrm->entries.data() + jump1;
	size_t nleft = jump2 - jump1;

	if (nleft > 0)
	{
	while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
	{
	const LRMEntry * mid = &it[half];

	// Prefetch both possible paths for the next step
	RR_PREFETCHR_CL(&it[half >> 1]);
	RR_PREFETCHR_CL(&mid[half >> 1]);

	// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
	// as quickly as it could but it has the advantage of being a very simple update.
	//
	// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
	// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
	// update rule slightly cheaper, even though it's a bit sloppy.
	it = (mid->hash < hash) ? mid : it; // conditional move
	nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
	}

	// The above loop shrank nleft down to 1. Do the final step.
	RR_ASSERT(nleft == 1);
	it += (it->hash < hash);
	}

	return it;
	}

	// Memory-pipelined: look up in large batches, getting as many independent
	// cache misses into the pipe as possible before actually doing any of the
	// loads.
	template<int N>
	static void lrm_search_multi_pf2(const LRM * lrm, const U8 * ptr, LRMEntry const ** it)
	{
	const LRMEntry * entries = lrm->entries.data();

	LRM_hash_t hash[N];
	size_t nleft[N];
	size_t nleft_max = 0;

	for (int i = 0; i < N; i++)
	{
	hash[i] = lrm_hash8(ptr + i);
	LRM_hash_t ji = hash[i] >> lrm->jumpInShift;

	S32 jump1 = lrm->jumpIn[ji];
	S32 jump2 = lrm->jumpIn[ji + 1];

	it[i] = entries + jump1;
	nleft[i] = jump2 - jump1;
	nleft_max = RR_MAX(nleft_max, nleft[i]);

	RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
	}

	// Run binary search iterations while at least one search has a
	// non-trivial range left
	while (nleft_max > 1)
	{
	for (int i = 0; i < N; i++)
	{
	size_t half = nleft[i] >> 1;
	const LRMEntry * cur = it[i];
	const LRMEntry * mid = cur + half;

	it[i] = (mid->hash < hash[i]) ? mid : cur; // conditional move
	nleft[i] -= half;

	RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
	}

	nleft_max -= nleft_max >> 1;
	}

	// Finalize the searches
	for (int i = 0; i < N; i++)
	{
	// Final step
	RR_ASSERT(nleft[i] <= 1);
	it[i] += (it[i]->hash < hash[i]);
	}
	}

	// Results:
	// SimpleProf :seconds calls count : clk/call clk/count min
	// search_one : 1.9221 1 8388480 : 5581904066.0 665.42 665.42 <- baseline without PF
	// search_one_pf : 1.4311 1 8388480 : 4155978905.0 495.44 495.44 <- baseline with PF
	// search_multi8_pf2 : 0.8201 1 8388480 : 2381551257.0 283.91 283.91
	// search_multi16_pf2 : 0.6135 1 8388480 : 1781563475.0 212.38 212.38
	// search_multi32_pf2 : 0.5709 1 8388480 : 1657999650.0 197.65 197.65
	// search_multi64_pf2 : 0.5591 1 8388480 : 1623743724.0 193.57 193.57
	// search_multi128_pf2: 0.5739 1 8388480 : 1666734087.0 198.69 198.69