buybackoff · August 31, 2024 20:44
diff --git a/Benchmark.md b/Benchmark.md
diff --git a/BinarySearch1.cs b/BinarySearch1.cs
 /// <summary>
 /// Searches a sorted run of elements and returns the index of the value or its negative binary complement.
 /// Dispatches to an AVX2 implementation for the supported primitive types and otherwise falls back
 /// to <c>BinarySearchClassic</c>.
 /// </summary>
 /// <param name="vecStart">Reference to the first element of the sorted run.</param>
 /// <param name="length">Number of elements in the run.</param>
 /// <param name="value">Value to look for.</param>
 /// <param name="comparer">
 /// Comparer used by the scalar fallback only; the AVX2 paths compare the raw values and do not consult it.
 /// </param>
 /// <returns>Index of <paramref name="value"/>, or the bitwise complement of its insertion index when absent.</returns>
 [MethodImpl(MethodImplOptions.AggressiveInlining
 #if HAS_AGGR_OPT
            | MethodImplOptions.AggressiveOptimization
 #endif
 )]
 [SuppressMessage("ReSharper", "HeapView.BoxingAllocation")] // false warnings for (type)(object)value pattern
 public static int BinarySearch<T>(ref T vecStart, int length, T value, KeyComparer<T> comparer = default)
 {
 #if HAS_INTRINSICS
    if (Avx2.IsSupported)
    {
        if (typeof(T) == typeof(sbyte))
            return BinarySearchAvx(ref Unsafe.As<T, sbyte>(ref vecStart), length, (sbyte) (object) value);

        if (typeof(T) == typeof(short))
            return BinarySearchAvx(ref Unsafe.As<T, short>(ref vecStart), length, (short) (object) value);

        if (typeof(T) == typeof(int))
            return BinarySearchAvx(ref Unsafe.As<T, int>(ref vecStart), length, (int) (object) value);

        if (typeof(T) == typeof(long)
            || typeof(T) == typeof(Timestamp)
            || typeof(T) == typeof(DateTime)
        )
        {
            // `(long)(object)value` is only valid when T is exactly long: unboxing a boxed
            // Timestamp or DateTime to long throws InvalidCastException. Reinterpret the bits
            // instead, exactly as is already done for vecStart.
            // NOTE(review): assumes Timestamp is a 64-bit value type, and that ordering DateTime
            // by its raw 64 bits is acceptable to callers (raw bits include the Kind flags) - confirm.
            return BinarySearchAvx(ref Unsafe.As<T, long>(ref vecStart), length, Unsafe.As<T, long>(ref value));
        }
    }
 #endif

    // This one is actually very hard to beat in general case
    // because of memory access (cache miss) costs. In the classic
    // algorithm every memory access is useful, i.e. it halves the
    // search space. K-ary search has K-2 useless memory accesses.
    // E.g. for SIMD-ized search with K = 4 we do 4 memory accesses
    // but reduce the search space to the same size as 2 accesses
    // in the classic algorithm. SIMD doesn't speedup memory access,
    // which is the main cost for high number of items.
    return BinarySearchClassic(ref vecStart, length, value, comparer);
 }

 #if HAS_INTRINSICS
 /// <summary>
 /// AVX2-assisted binary search over a sorted run of <see cref="long"/> values. Each vector step
 /// compares the searched value against <c>Vector256&lt;long&gt;.Count</c> consecutive elements
 /// around the midpoint; the remaining short range is scanned linearly.
 /// </summary>
 /// <param name="vecStart">Reference to the first element of the sorted run.</param>
 /// <param name="length">Number of elements in the run.</param>
 /// <param name="value">Value to look for.</param>
 /// <returns>Index of <paramref name="value"/>, or the bitwise complement of its insertion index when absent.</returns>
 [MethodImpl(MethodImplOptions.AggressiveInlining
 #if HAS_AGGR_OPT
            | MethodImplOptions.AggressiveOptimization
 #endif
 )]
 internal static int BinarySearchAvx(ref long vecStart, int length, long value)
 {
    unchecked
    {
        // The scalar tail below reads vecStart[lo] before it checks any bound, so an empty
        // range must be rejected up front; otherwise the result depends on memory outside
        // the run. ~0 is what BinarySearchClassic returns for the same input.
        if (length <= 0)
        {
            return ~0;
        }

        int i;
        int c;
        int lo = 0;
        int hi = length - 1;
        var valVec = Vector256.Create(value);

        // Vector phase: runs while at least Count + 1 elements remain, which keeps the
        // window [i, i + Count - 1] inside [lo, hi].
        while (hi - lo > Vector256<long>.Count - 1)
        {
            i = (int) (((uint) hi + (uint) lo) >> 1) - (Vector256<long>.Count >> 1);

            var vec = Unsafe.ReadUnaligned<Vector256<long>>(ref Unsafe.As<long, byte>(ref Unsafe.Add(ref vecStart, i)));

            // AVX512 has _mm256_cmpge_epi64_mask that should allow to combine the two operations
            // and avoid edge-case check in `mask == 0` case below
            var gt = Avx2.CompareGreaterThan(valVec, vec); // _mm256_cmpgt_epi64
            var mask = Avx2.MoveMask(gt.AsByte()); // one bit per byte of each lane where value > element

            if (mask == 0) // val is smaller than all in vec
            {
                // but could be equal to the first element
                c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, i)));
                if (c == 0)
                {
                    lo = i;
                    goto RETURN;
                }

                hi = i - 1;
            }
            else if (mask == -1) // val is larger than all in vec
            {
                lo = i + Vector256<long>.Count;
            }
            else
            {
                // Some leading lanes are smaller than value; the highest set bit of the mask
                // gives the number of such lanes, i.e. the offset of the first element >= value.
                var clz = BitUtil.NumberOfLeadingZeros(mask);
                var index = (32 - clz) / Unsafe.SizeOf<long>();
                lo = i + index;
                c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, lo)));
                goto RETURN;
            }
        }

        // Scalar phase over the short remainder. Both operands of `&` are always evaluated,
        // so `lo` is incremented even on the iteration that stops the loop.
        while ((c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, lo)))) > 0
               & ++lo <= hi) // if using branchless & then need to correct lo below
        {
        }

        // correct back non-short-circuit & evaluation
        lo -= UnsafeEx.Clt(c, 1); // (int)(c < 1)

        RETURN:
        // Branchless select: ceq1 is all ones when c == 0 (found -> lo), zero otherwise (-> ~lo).
        var ceq1 = -UnsafeEx.Ceq(c, 0); // (int)(c == 0)
        return (ceq1 & lo) | (~ceq1 & ~lo);
    }
 }
 #endif

 /// <summary>
 /// Scalar binary search over a sorted run. Returns the index of a matching element,
 /// or the bitwise complement of the insertion index when no element matches.
 /// </summary>
 [MethodImpl(MethodImplOptions.AggressiveInlining
 #if HAS_AGGR_OPT
            | MethodImplOptions.AggressiveOptimization
 #endif
 )]
 internal static int BinarySearchClassic<T>(ref T vecStart, int length, T value, KeyComparer<T> comparer = default)
 {
    unchecked
    {
        int low = 0;
        int high = length - 1;

        // An empty run gives high == -1, so the body below never executes.
        while (low <= high)
        {
            // Inside the loop both bounds are non-negative and at most int.MaxValue,
            // so their sum fits in a uint. Halving the unsigned sum avoids the extra
            // subtraction of the `low + ((high - low) >> 1)` form.
            int mid = (int) (((uint) low + (uint) high) >> 1);

            int order = comparer.Compare(value, UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, mid)));

            if (order > 0)
            {
                // value sorts after the probe: continue in the upper half.
                low = mid + 1;
            }
            else if (order < 0)
            {
                // value sorts before the probe: continue in the lower half.
                high = mid - 1;
            }
            else
            {
                return mid;
            }
        }

        // No match: `low` is now the index of the first element larger than value
        // (or `length` when there is none); report it as a bitwise complement.
        return ~low;
    }
 }
diff --git a/BinarySearch2.cs b/BinarySearch2.cs
 #if HAS_INTRINSICS
        /// <summary>
        /// AVX2-assisted binary search over a sorted run of <see cref="long"/> values. The main loop
        /// probes a vector of <c>Vector256&lt;long&gt;.Count</c> elements around the midpoint, a tail
        /// loop consumes the remainder one full vector at a time, and whatever is left is scanned linearly.
        /// </summary>
        /// <param name="vecStart">Reference to the first element of the sorted run.</param>
        /// <param name="length">Number of elements in the run.</param>
        /// <param name="value">Value to look for.</param>
        /// <returns>Index of <paramref name="value"/>, or the bitwise complement of its insertion index when absent.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining
 #if HAS_AGGR_OPT
                    | MethodImplOptions.AggressiveOptimization
 #endif
        )]
        internal static int BinarySearchAvx2(ref long vecStart, int length, long value)
        {
            unchecked
            {
                // Starts as "not equal" so that an empty range, or a range exhausted by the
                // vector tail, yields ~lo without touching memory.
                int c = 1;
                int lo = 0;
                int hi = length - 1;
                Vector256<long> vec;
                Vector256<long> gt;
                int mask;

                // Too short for the vector phases: skip creating the broadcast vector.
                if (hi - lo < Vector256<long>.Count)
                    goto LINEAR;

                var valVec = Vector256.Create(value);
                while (hi - lo >= Vector256<long>.Count * 2)
                {
                    var i = (int) (((uint) hi + (uint) lo - Vector256<long>.Count) >> 1);

                    vec = Unsafe.ReadUnaligned<Vector256<long>>(ref Unsafe.As<long, byte>(ref Unsafe.Add(ref vecStart, i)));
                    gt = Avx2.CompareGreaterThan(valVec, vec);
                    mask = Avx2.MoveMask(gt.AsByte());

                    if (mask != -1)
                    {
                        if (mask != 0)
                        {
                            // Some leading lanes are smaller than value: the highest set bit
                            // of the byte mask gives the offset of the first element >= value.
                            // NOTE(review): Lzcnt.IsSupported is not checked here - confirm callers
                            // only reach this on hardware that has LZCNT alongside AVX2.
                            int clz = (int) Lzcnt.LeadingZeroCount((uint) mask);
                            int index = (32 - clz) / Unsafe.SizeOf<long>();
                            lo = i + index;
                            c = value.CompareTo(UnsafeEx.ReadUnaligned<long>(ref Unsafe.Add<long>(ref vecStart, lo)));
                            goto RETURN;
                        }

                        // val is not greater than all in vec
                        // not i-1, i could equal;
                        hi = i;
                    }
                    else
                    {
                        // val is larger than all in vec
                        lo = i + Vector256<long>.Count;
                    }
                }

                // Vector tail. The original block here had lost its `do`, leaving an empty-bodied
                // `while (...) ;` that never terminates once its condition is true, and it loaded a
                // vector at `lo` even when fewer than Count elements remained. Probing only while
                // a whole vector fits inside [lo, hi] keeps every load within the run.
                while (hi - lo >= Vector256<long>.Count - 1)
                {
                    vec = Unsafe.ReadUnaligned<Vector256<long>>(ref Unsafe.As<long, byte>(ref Unsafe.Add(ref vecStart, lo)));
                    gt = Avx2.CompareGreaterThan(valVec, vec); // _mm256_cmpgt_epi64
                    mask = Avx2.MoveMask(gt.AsByte());

                    if (mask != -1)
                    {
                        // At least one lane is >= value: step to the first such lane and let the
                        // linear phase below classify it with a single comparison.
                        var clz = (int) Lzcnt.LeadingZeroCount((uint) mask);
                        lo += (32 - clz) / Unsafe.SizeOf<long>();
                        break;
                    }

                    // Every lane is smaller than value: skip the whole vector.
                    lo += Vector256<long>.Count;
                }

                LINEAR:
                // The bound is checked before the read so that `lo == hi + 1` (empty input, or
                // everything consumed above) never dereferences past the run.
                while (lo <= hi
                       && (c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, lo)))) > 0)
                {
                    lo++;
                }

                RETURN:
                // Branchless select: all ones when c == 0 (found -> lo), zero otherwise (-> ~lo).
                var ceq1 = -UnsafeEx.Ceq(c, 0);
                return (ceq1 & lo) | (~ceq1 & ~lo);
            }
        }

 #endif
diff --git a/Note on branchless code in .NET.md b/Note on branchless code in .NET.md
N	Classic	Avx	Avx+	Avx/Classic	Avx+/Avx	Avx+/Classic
1	569.0	390.3	630.0	-31%	61%	11%
2	537.5	287.0	616.1	-47%	115%	15%
4	286.0	209.5	629.5	-27%	201%	120%
8	185.2	290.4	247.5	57%	-15%	34%
16	120.4	215.3	199.4	79%	-7%	66%
32	99.5	144.2	153.8	45%	7%	55%
64	76.4	119.2	129.5	56%	9%	69%
128	61.2	101.5	111.1	66%	10%	82%
256	50.2	81.0	83.3	61%	3%	66%
512	29.1	43.8	74.8	50%	71%	157%
1024	22.5	31.0	43.7	38%	41%	94%
4096	19.0	23.3	30.7	23%	32%	62%
16384	17.7	20.5	28.3	16%	38%	60%
65536	16.7	19.6	24.6	17%	26%	47%
131072	15.9	19.1	23.1	20%	21%	45%
			Avg	28%	41%	65%
			Min	-47%	-15%	11%
			Max	79%	201%	157%
Case	MOPS	Elapsed
BS_Classic_2	570.38	15 ms
BS_Classic_1	569.90	15 ms
BS_Avx_1	505.60	17 ms
BS_Avx_2	504.03	17 ms
BS_Avx_4	503.77	17 ms
BS_Classic_4	288.84	29 ms
BS_Avx_8	285.34	29 ms
BS_Avx_16	186.61	45 ms
BS_Classic_8	183.58	46 ms
BS_Avx_32	151.42	55 ms
BS_Avx_64	117.92	71 ms
BS_Classic_16	111.73	75 ms
BS_Avx_128	95.59	88 ms
BS_Classic_32	90.19	93 ms
BS_Classic_64	71.43	117 ms
BS_Avx_256	63.05	133 ms
BS_Classic_128	60.40	139 ms
BS_Classic_256	47.07	178 ms
BS_Avx_512	40.02	210 ms
BS_Avx_1024	31.25	268 ms
BS_Classic_512	28.62	293 ms
BS_Avx_4096	22.84	367 ms
BS_Classic_1024	22.35	375 ms
BS_Avx_16384	19.77	424 ms
BS_Classic_4096	19.03	441 ms
BS_Classic_16384	17.71	474 ms
Case	MOPS	Elapsed
BS_Avx+_4	519.18	40 ms
BS_Avx+_1	507.19	41 ms
BS_Avx+_2	498.41	42 ms
BS_Avx_1	372.98	56 ms
BS_Avx_2	283.01	74 ms
BS_Avx_8	260.14	81 ms
BS_Avx+_8	244.84	86 ms
BS_Avx_4	241.00	87 ms
BS_Avx_16	201.39	104 ms
BS_Avx+_16	197.56	106 ms
BS_Avx+_32	157.66	133 ms
BS_Avx_32	145.97	144 ms
BS_Avx+_64	127.95	164 ms
BS_Avx_64	122.45	171 ms
BS_Avx+_128	108.82	193 ms
BS_Avx_128	98.70	212 ms
BS_Avx+_256	89.20	235 ms
BS_Avx_256	81.84	256 ms
BS_Avx+_512	72.26	290 ms
BS_Avx+_1024	44.33	473 ms
BS_Avx_512	36.27	578 ms
BS_Avx+_4096	30.81	681 ms
BS_Avx_1024	28.90	726 ms
BS_Avx+_16384	28.55	735 ms
BS_Avx+_65536	24.31	863 ms
BS_Avx+_131072	22.53	931 ms
BS_Avx_4096	22.53	931 ms
BS_Avx_16384	19.56	1,072 ms
BS_Avx_65536	18.17	1,154 ms
BS_Avx_131072	17.37	1,207 ms
Case	MOPS	Elapsed
BS_Avx+_1	629.99	33 ms
BS_Avx+_4	629.49	33 ms
BS_Avx+_2	616.09	34 ms
BS_Classic_1	569.00	37 ms
BS_Classic_2	537.47	39 ms
BS_Avx_1	390.32	54 ms
BS_Avx_8	290.42	72 ms
BS_Avx_2	286.97	73 ms
BS_Classic_4	286.03	73 ms
BS_Avx+_8	247.45	85 ms
BS_Avx_16	215.32	97 ms
BS_Avx_4	209.48	100 ms
BS_Avx+_16	199.38	105 ms
BS_Classic_8	185.15	113 ms
BS_Avx+_32	153.79	136 ms
BS_Avx_32	144.24	145 ms
BS_Avx+_64	129.52	162 ms
BS_Classic_16	120.41	174 ms
BS_Avx_64	119.15	176 ms
BS_Avx+_128	111.14	189 ms
BS_Avx_128	101.45	207 ms
BS_Classic_32	99.50	211 ms
BS_Avx+_256	83.33	252 ms
BS_Avx_256	81.02	259 ms
BS_Classic_64	76.43	274 ms
BS_Avx+_512	74.81	280 ms
BS_Classic_128	61.20	343 ms
BS_Classic_256	50.18	418 ms
BS_Avx_512	43.84	478 ms
BS_Avx+_1024	43.72	480 ms
BS_Avx_1024	31.00	676 ms
BS_Avx+_4096	30.71	683 ms
BS_Classic_512	29.13	720 ms
BS_Avx+_16384	28.26	742 ms
BS_Avx+_65536	24.60	853 ms
BS_Avx_4096	23.28	901 ms
BS_Avx+_131072	23.08	909 ms
BS_Classic_1024	22.48	933 ms
BS_Avx_16384	20.52	1,022 ms
BS_Avx_65536	19.57	1,072 ms
BS_Avx_131072	19.06	1,100 ms
BS_Classic_4096	18.95	1,106 ms
BS_Classic_16384	17.68	1,186 ms
BS_Classic_65536	16.71	1,255 ms
BS_Classic_131072	15.87	1,321 ms
	/// <summary>
	/// Searches a sorted run of elements and returns the index of the value or its negative binary complement.
	/// Dispatches to an AVX2 implementation for the supported primitive types and otherwise falls back
	/// to <c>BinarySearchClassic</c>.
	/// </summary>
	/// <param name="vecStart">Reference to the first element of the sorted run.</param>
	/// <param name="length">Number of elements in the run.</param>
	/// <param name="value">Value to look for.</param>
	/// <param name="comparer">
	/// Comparer used by the scalar fallback only; the AVX2 paths compare the raw values and do not consult it.
	/// </param>
	/// <returns>Index of <paramref name="value"/>, or the bitwise complement of its insertion index when absent.</returns>
	[MethodImpl(MethodImplOptions.AggressiveInlining
	#if HAS_AGGR_OPT
	            | MethodImplOptions.AggressiveOptimization
	#endif
	)]
	[SuppressMessage("ReSharper", "HeapView.BoxingAllocation")] // false warnings for (type)(object)value pattern
	public static int BinarySearch<T>(ref T vecStart, int length, T value, KeyComparer<T> comparer = default)
	{
	#if HAS_INTRINSICS
	    if (Avx2.IsSupported)
	    {
	        if (typeof(T) == typeof(sbyte))
	            return BinarySearchAvx(ref Unsafe.As<T, sbyte>(ref vecStart), length, (sbyte) (object) value);

	        if (typeof(T) == typeof(short))
	            return BinarySearchAvx(ref Unsafe.As<T, short>(ref vecStart), length, (short) (object) value);

	        if (typeof(T) == typeof(int))
	            return BinarySearchAvx(ref Unsafe.As<T, int>(ref vecStart), length, (int) (object) value);

	        if (typeof(T) == typeof(long)
	            || typeof(T) == typeof(Timestamp)
	            || typeof(T) == typeof(DateTime)
	        )
	        {
	            // `(long)(object)value` is only valid when T is exactly long: unboxing a boxed
	            // Timestamp or DateTime to long throws InvalidCastException. Reinterpret the bits
	            // instead, exactly as is already done for vecStart.
	            // NOTE(review): assumes Timestamp is a 64-bit value type, and that ordering DateTime
	            // by its raw 64 bits is acceptable to callers (raw bits include the Kind flags) - confirm.
	            return BinarySearchAvx(ref Unsafe.As<T, long>(ref vecStart), length, Unsafe.As<T, long>(ref value));
	        }
	    }
	#endif

	    // This one is actually very hard to beat in general case
	    // because of memory access (cache miss) costs. In the classic
	    // algorithm every memory access is useful, i.e. it halves the
	    // search space. K-ary search has K-2 useless memory accesses.
	    // E.g. for SIMD-ized search with K = 4 we do 4 memory accesses
	    // but reduce the search space to the same size as 2 accesses
	    // in the classic algorithm. SIMD doesn't speedup memory access,
	    // which is the main cost for high number of items.
	    return BinarySearchClassic(ref vecStart, length, value, comparer);
	}

	#if HAS_INTRINSICS
	/// <summary>
	/// AVX2-assisted binary search over a sorted run of <see cref="long"/> values. Each vector step
	/// compares the searched value against <c>Vector256&lt;long&gt;.Count</c> consecutive elements
	/// around the midpoint; the remaining short range is scanned linearly.
	/// </summary>
	/// <param name="vecStart">Reference to the first element of the sorted run.</param>
	/// <param name="length">Number of elements in the run.</param>
	/// <param name="value">Value to look for.</param>
	/// <returns>Index of <paramref name="value"/>, or the bitwise complement of its insertion index when absent.</returns>
	[MethodImpl(MethodImplOptions.AggressiveInlining
	#if HAS_AGGR_OPT
	            | MethodImplOptions.AggressiveOptimization
	#endif
	)]
	internal static int BinarySearchAvx(ref long vecStart, int length, long value)
	{
	    unchecked
	    {
	        // The scalar tail below reads vecStart[lo] before it checks any bound, so an empty
	        // range must be rejected up front; otherwise the result depends on memory outside
	        // the run. ~0 is what BinarySearchClassic returns for the same input.
	        if (length <= 0)
	        {
	            return ~0;
	        }

	        int i;
	        int c;
	        int lo = 0;
	        int hi = length - 1;
	        var valVec = Vector256.Create(value);

	        // Vector phase: runs while at least Count + 1 elements remain, which keeps the
	        // window [i, i + Count - 1] inside [lo, hi].
	        while (hi - lo > Vector256<long>.Count - 1)
	        {
	            i = (int) (((uint) hi + (uint) lo) >> 1) - (Vector256<long>.Count >> 1);

	            var vec = Unsafe.ReadUnaligned<Vector256<long>>(ref Unsafe.As<long, byte>(ref Unsafe.Add(ref vecStart, i)));

	            // AVX512 has _mm256_cmpge_epi64_mask that should allow to combine the two operations
	            // and avoid edge-case check in `mask == 0` case below
	            var gt = Avx2.CompareGreaterThan(valVec, vec); // _mm256_cmpgt_epi64
	            var mask = Avx2.MoveMask(gt.AsByte()); // one bit per byte of each lane where value > element

	            if (mask == 0) // val is smaller than all in vec
	            {
	                // but could be equal to the first element
	                c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, i)));
	                if (c == 0)
	                {
	                    lo = i;
	                    goto RETURN;
	                }

	                hi = i - 1;
	            }
	            else if (mask == -1) // val is larger than all in vec
	            {
	                lo = i + Vector256<long>.Count;
	            }
	            else
	            {
	                // Some leading lanes are smaller than value; the highest set bit of the mask
	                // gives the number of such lanes, i.e. the offset of the first element >= value.
	                var clz = BitUtil.NumberOfLeadingZeros(mask);
	                var index = (32 - clz) / Unsafe.SizeOf<long>();
	                lo = i + index;
	                c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, lo)));
	                goto RETURN;
	            }
	        }

	        // Scalar phase over the short remainder. Both operands of `&` are always evaluated,
	        // so `lo` is incremented even on the iteration that stops the loop.
	        while ((c = value.CompareTo(UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, lo)))) > 0
	               & ++lo <= hi) // if using branchless & then need to correct lo below
	        {
	        }

	        // correct back non-short-circuit & evaluation
	        lo -= UnsafeEx.Clt(c, 1); // (int)(c < 1)

	        RETURN:
	        // Branchless select: ceq1 is all ones when c == 0 (found -> lo), zero otherwise (-> ~lo).
	        var ceq1 = -UnsafeEx.Ceq(c, 0); // (int)(c == 0)
	        return (ceq1 & lo) | (~ceq1 & ~lo);
	    }
	}
	#endif

	/// <summary>
	/// Performs classic binary search and returns index of the value or its negative binary complement.
	/// </summary>
	/// <param name="vecStart">Reference to the first element of the sorted run.</param>
	/// <param name="length">Number of elements in the run.</param>
	/// <param name="value">Value to look for.</param>
	/// <param name="comparer">Comparer used to order <paramref name="value"/> against each probed element.</param>
	/// <returns>Index of a matching element, or the bitwise complement of the insertion index when absent.</returns>
	[MethodImpl(MethodImplOptions.AggressiveInlining
	#if HAS_AGGR_OPT
	            | MethodImplOptions.AggressiveOptimization
	#endif
	)]
	internal static int BinarySearchClassic<T>(ref T vecStart, int length, T value, KeyComparer<T> comparer = default)
	{
	    unchecked
	    {
	        int lo = 0;
	        int hi = length - 1;
	        // If length == 0, hi == -1, and loop will not be entered
	        while (lo <= hi)
	        {
	            // PERF: `lo` or `hi` will never be negative inside the loop,
	            //       so computing median using uints is safe since we know
	            //       `length <= int.MaxValue`, and indices are >= 0
	            //       and thus cannot overflow an uint.
	            //       Saves one subtraction per loop compared to
	            //       `int i = lo + ((hi - lo) >> 1);`
	            int i = (int) (((uint) hi + (uint) lo) >> 1);

	            int c = comparer.Compare(value, UnsafeEx.ReadUnaligned(ref Unsafe.Add(ref vecStart, i)));

	            if (c == 0)
	            {
	                return i;
	            }

	            if (c > 0)
	            {
	                lo = i + 1;
	            }
	            else
	            {
	                hi = i - 1;
	            }
	        }

	        // If none found, then a negative number that is the bitwise complement
	        // of the index of the next element that is larger than or, if there is
	        // no larger element, the bitwise complement of `length`, which
	        // is `lo` at this point.
	        return ~lo;
	    }
	}