gfoidl · August 23, 2023 19:08 · gfoidl · Sep 25, 2022
diff --git a/Vector_Reduce_Min_Max.cs b/Vector_Reduce_Min_Max.cs
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;

 internal static class VectorExtensions
 {
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static T ReduceMin<T>(this Vector128<T> vector) where T : struct, IBinaryInteger<T>
    {
        if (Sse41.IsSupported)
        {
            if (typeof(T) == typeof(byte))
            {
                // We treat the vector as short and shift right by 8 bits. Thus we get
                // v0:      <b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15>
                // shifted: <b1,  0, b3,  0, b5,  0, b7,  0, b9,  0, b11,   0, b13,   0, b15,   0>
                // Then when treated as vector of byte again, we can build 8 pairwise
                // minimums.
                // With SSE4.1's _mm_minpos_epu16 we get the final minimum.
                // As all values are within the byte-range, and can't exceed it, the 
                // cast from ushort -> byte is safe.

                Vector128<byte> v0       = vector.AsByte();
                Vector128<byte> shifted  = Sse2.ShiftRightLogical(v0.AsInt16(), 8).AsByte();
                Vector128<byte> tmp      = Sse2.Min(v0, shifted);
                Vector128<ushort> minTmp = Sse41.MinHorizontal(tmp.AsUInt16());
                return (T)(object)(byte)minTmp.ToScalar();
            }
            else if (typeof(T) == typeof(sbyte))
            {
                // We transform the values so that sbyte.MinValue -> 0, sbyte.MaxValue -> byte.MaxValue.
                // Thus we reduced the problem to be the same as the minimum of byte-vector (above).
                // The transformation needs to be undone before returning the result.
                Vector128<byte> v0 = (vector.AsSByte() - Vector128.Create(sbyte.MinValue)).AsByte();
                byte minTmp        = ReduceMin(v0);
                return (T)(object)(sbyte)(minTmp - sbyte.MinValue);
            }
            else if (typeof(T) == typeof(ushort))
            {
                Vector128<ushort> minTmp = Sse41.MinHorizontal(vector.AsUInt16());
                return (T)(object)minTmp.ToScalar();
            }
            else if (typeof(T) == typeof(short))
            {
                // We transformm the values so that short.MinValue -> 0, short.MaxValue -> ushort.MaxValue.
                // Thus we reduced the problem to be the same as the minimum of ushort-vector (above).
                // The transformation needs to be undone before returning the result.
                Vector128<ushort> v0 = (vector.AsInt16() - Vector128.Create(short.MinValue)).AsUInt16();
                ushort minTmp        = ReduceMin(v0);
                return (T)(object)(short)(minTmp - short.MinValue);
            }
            else if (typeof(T) == typeof(uint))
            {
                Vector128<uint> v0   = vector.AsUInt32();
                Vector128<uint> v1   = Sse2.Shuffle(v0, 0b_11_11_01_01);
                // https://github.com/dotnet/runtime/issues/75892
                Vector128<uint> val0 = Sse41.Min(v0, v1);
                Vector128<uint> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10);

                return (T)(object)Sse41.Min(val0, val1).ToScalar();
            }
            else if (typeof(T) == typeof(int))
            {
                Vector128<int> v0   = vector.AsInt32();
                Vector128<int> v1   = Sse2.Shuffle(v0, 0b_11_11_01_01);
                // https://github.com/dotnet/runtime/issues/75892
                Vector128<int> val0 = Sse41.Min(v0, v1);
                Vector128<int> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10);

                return (T)(object)Sse41.Min(val0, val1).ToScalar();
            }
            // These are not worth it.
            //else if (typeof(T) == typeof(ulong))
            //{
            //    Vector128<ulong> v0 = vector.AsUInt64();
            //    Vector128<ulong> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(ulong));

            //    return (T)(object)Vector128.Min(v0, v1).ToScalar();
            //}
            //else if (typeof(T) == typeof(long))
            //{
            //    Vector128<long> v0 = vector.AsInt64();
            //    Vector128<long> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(long));

            //    return (T)(object)Vector128.Min(v0, v1).ToScalar();
            //}
        }

        T min = vector[0];
        for (int i = 1; i < Vector128<T>.Count; i++)    // loop gets unrolled if count <= 4
        {
            if (vector[i] < min)
            {
                min = vector[i];
            }
        }

        return min;
    }
    //-------------------------------------------------------------------------
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static T ReduceMax<T>(this Vector128<T> vector) where T : struct, IBinaryInteger<T>
    {
        if (Sse41.IsSupported)
        {
            if (typeof(T) == typeof(byte))
            {
                // We transform the values so that byte.MaxValue -> 0, byte.MinValue -> byte.MaxValue.
                // Thus we reduced the problem to be the same as the minimum of byte-vector (above).
                // The transformation needs to be undone before returning the result.
                Vector128<byte> v0 = ~vector.AsByte();
                byte minTmp        = ReduceMin(v0);
                return (T)(object)(byte)~minTmp;
            }
            else if (typeof(T) == typeof(sbyte))
            {
                // We transform the values so that sbyte.MaxValue -> 0, sbyte.MinValue -> byte.MaxValue.
                // Thus we reduced the problem to be the same as the minimum of byte-vector (above).
                // The transformation needs to be undone before returning the result.
                Vector128<byte> v0 = (Vector128.Create(sbyte.MaxValue) - vector.AsSByte()).AsByte();
                byte minTmp        = ReduceMin(v0);
                return (T)(object)(sbyte)(sbyte.MaxValue - minTmp);
            }
            else if (typeof(T) == typeof(ushort))
            {
                // We transform the values so that ushort.MaxValue -> 0, ushort.MinValue -> ushort.MaxValue.
                // Thus we reduced the problem to be the same as the minimum of ushort-vector (above).
                // The transformation needs to be undone before returning the result.
                ushort minTmp = ReduceMin(~vector.AsUInt16());
                return (T)(object)(ushort)~minTmp;
            }
            else if (typeof(T) == typeof(short))
            {
                // We transform the values so that short.MaxValue -> 0, short.MinValue -> ushort.MaxValue.
                // Thus we reduced the problem to be the same as the minimum of ushort-vector (above).
                // The transformation needs to be undone before returning the result.
                Vector128<ushort> v0 = (Vector128.Create(short.MaxValue) - vector.AsInt16()).AsUInt16();
                ushort minTmp        = ReduceMin(v0);
                return (T)(object)(short)(short.MaxValue - minTmp);
            }
            else if (typeof(T) == typeof(uint))
            {
                Vector128<uint> v0   = vector.AsUInt32();
                Vector128<uint> v1   = Sse2.Shuffle(v0, 0b_11_11_01_01);
                // https://github.com/dotnet/runtime/issues/75892
                Vector128<uint> val0 = Sse41.Max(v0, v1);
                Vector128<uint> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10);

                return (T)(object)Sse41.Max(val0, val1).ToScalar();
            }
            else if (typeof(T) == typeof(int))
            {
                Vector128<int> v0   = vector.AsInt32();
                Vector128<int> v1   = Sse2.Shuffle(v0, 0b_11_11_01_01);
                // https://github.com/dotnet/runtime/issues/75892
                Vector128<int> val0 = Sse41.Max(v0, v1);
                Vector128<int> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10);

                return (T)(object)Sse41.Max(val0, val1).ToScalar();
            }
            // These are not worth it.
            //else if (typeof(T) == typeof(ulong))
            //{
            //    Vector128<ulong> v0 = vector.AsUInt64();
            //    Vector128<ulong> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(ulong));

            //    return (T)(object)Vector128.Max(v0, v1).ToScalar();
            //}
            //else if (typeof(T) == typeof(long))
            //{
            //    Vector128<long> v0 = vector.AsInt64();
            //    Vector128<long> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(long));

            //    return (T)(object)Vector128.Max(v0, v1).ToScalar();
            //}
        }

        T max = vector[0];
        for (int i = 1; i < Vector128<T>.Count; i++)    // loop gets unrolled if count <= 4
        {
            if (vector[i] > max)
            {
                max = vector[i];
            }
        }

        return max;
    }
    //-------------------------------------------------------------------------
    public static T ReduceMin<T>(this Vector256<T> vector) where T : struct, IBinaryInteger<T>
    {
        // TODO: implement.
        // So far I didn't think much about Vector256 here.
        // Simple way would be to operate on the two lanes, then combine lower and upper.

        throw new NotImplementedException();
    }
    //-------------------------------------------------------------------------
    public static T ReduceMax<T>(this Vector256<T> vector) where T : struct, IBinaryInteger<T>
    {
        // TODO: implement.
        // So far I didn't think much about Vector256 here.
        // Simple way would be to operate on the two lanes, then combine lower and upper.

        throw new NotImplementedException();
    }
 }
	using System.Numerics;
	using System.Runtime.CompilerServices;
	using System.Runtime.Intrinsics;
	using System.Runtime.Intrinsics.X86;

	internal static class VectorExtensions
	{
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	public static T ReduceMin<T>(this Vector128<T> vector) where T : struct, IBinaryInteger<T>
	{
	if (Sse41.IsSupported)
	{
	if (typeof(T) == typeof(byte))
	{
	// We treat the vector as short and shift right by 8 bits. Thus we get
	// v0: <b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15>
	// shifted: <b1, 0, b3, 0, b5, 0, b7, 0, b9, 0, b11, 0, b13, 0, b15, 0>
	// Then when treated as vector of byte again, we can build 8 pairwise
	// minimums.
	// With SSE4.1's _mm_minpos_epu16 we get the final minimum.
	// As all values are within the byte-range, and can't exceed it, the
	// cast from ushort -> byte is safe.

	Vector128<byte> v0 = vector.AsByte();
	Vector128<byte> shifted = Sse2.ShiftRightLogical(v0.AsInt16(), 8).AsByte();
	Vector128<byte> tmp = Sse2.Min(v0, shifted);
	Vector128<ushort> minTmp = Sse41.MinHorizontal(tmp.AsUInt16());
	return (T)(object)(byte)minTmp.ToScalar();
	}
	else if (typeof(T) == typeof(sbyte))
	{
	// We transform the values so that sbyte.MinValue -> 0, sbyte.MaxValue -> byte.MaxValue.
	// Thus we reduced the problem to be the same as the minimum of byte-vector (above).
	// The transformation needs to be undone before returning the result.
	Vector128<byte> v0 = (vector.AsSByte() - Vector128.Create(sbyte.MinValue)).AsByte();
	byte minTmp = ReduceMin(v0);
	return (T)(object)(sbyte)(minTmp - sbyte.MinValue);
	}
	else if (typeof(T) == typeof(ushort))
	{
	Vector128<ushort> minTmp = Sse41.MinHorizontal(vector.AsUInt16());
	return (T)(object)minTmp.ToScalar();
	}
	else if (typeof(T) == typeof(short))
	{
	// We transformm the values so that short.MinValue -> 0, short.MaxValue -> ushort.MaxValue.
	// Thus we reduced the problem to be the same as the minimum of ushort-vector (above).
	// The transformation needs to be undone before returning the result.
	Vector128<ushort> v0 = (vector.AsInt16() - Vector128.Create(short.MinValue)).AsUInt16();
	ushort minTmp = ReduceMin(v0);
	return (T)(object)(short)(minTmp - short.MinValue);
	}
	else if (typeof(T) == typeof(uint))
	{
	Vector128<uint> v0 = vector.AsUInt32();
	Vector128<uint> v1 = Sse2.Shuffle(v0, 0b_11_11_01_01);
	// https://github.com/dotnet/runtime/issues/75892
	Vector128<uint> val0 = Sse41.Min(v0, v1);
	Vector128<uint> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10);

	return (T)(object)Sse41.Min(val0, val1).ToScalar();
	}
	else if (typeof(T) == typeof(int))
	{
	Vector128<int> v0 = vector.AsInt32();
	Vector128<int> v1 = Sse2.Shuffle(v0, 0b_11_11_01_01);
	// https://github.com/dotnet/runtime/issues/75892
	Vector128<int> val0 = Sse41.Min(v0, v1);
	Vector128<int> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10);

	return (T)(object)Sse41.Min(val0, val1).ToScalar();
	}
	// These are not worth it.
	//else if (typeof(T) == typeof(ulong))
	//{
	// Vector128<ulong> v0 = vector.AsUInt64();
	// Vector128<ulong> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(ulong));

	// return (T)(object)Vector128.Min(v0, v1).ToScalar();
	//}
	//else if (typeof(T) == typeof(long))
	//{
	// Vector128<long> v0 = vector.AsInt64();
	// Vector128<long> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(long));

	// return (T)(object)Vector128.Min(v0, v1).ToScalar();
	//}
	}

	T min = vector[0];
	for (int i = 1; i < Vector128<T>.Count; i++) // loop gets unrolled if count <= 4
	{
	if (vector[i] < min)
	{
	min = vector[i];
	}
	}

	return min;
	}
	//-------------------------------------------------------------------------
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	public static T ReduceMax<T>(this Vector128<T> vector) where T : struct, IBinaryInteger<T>
	{
	if (Sse41.IsSupported)
	{
	if (typeof(T) == typeof(byte))
	{
	// We transform the values so that byte.MaxValue -> 0, byte.MinValue -> byte.MaxValue.
	// Thus we reduced the problem to be the same as the minimum of byte-vector (above).
	// The transformation needs to be undone before returning the result.
	Vector128<byte> v0 = ~vector.AsByte();
	byte minTmp = ReduceMin(v0);
	return (T)(object)(byte)~minTmp;
	}
	else if (typeof(T) == typeof(sbyte))
	{
	// We transform the values so that sbyte.MaxValue -> 0, sbyte.MinValue -> byte.MaxValue.
	// Thus we reduced the problem to be the same as the minimum of byte-vector (above).
	// The transformation needs to be undone before returning the result.
	Vector128<byte> v0 = (Vector128.Create(sbyte.MaxValue) - vector.AsSByte()).AsByte();
	byte minTmp = ReduceMin(v0);
	return (T)(object)(sbyte)(sbyte.MaxValue - minTmp);
	}
	else if (typeof(T) == typeof(ushort))
	{
	// We transform the values so that ushort.MaxValue -> 0, ushort.MinValue -> ushort.MaxValue.
	// Thus we reduced the problem to be the same as the minimum of ushort-vector (above).
	// The transformation needs to be undone before returning the result.
	ushort minTmp = ReduceMin(~vector.AsUInt16());
	return (T)(object)(ushort)~minTmp;
	}
	else if (typeof(T) == typeof(short))
	{
	// We transform the values so that short.MaxValue -> 0, short.MinValue -> ushort.MaxValue.
	// Thus we reduced the problem to be the same as the minimum of ushort-vector (above).
	// The transformation needs to be undone before returning the result.
	Vector128<ushort> v0 = (Vector128.Create(short.MaxValue) - vector.AsInt16()).AsUInt16();
	ushort minTmp = ReduceMin(v0);
	return (T)(object)(short)(short.MaxValue - minTmp);
	}
	else if (typeof(T) == typeof(uint))
	{
	Vector128<uint> v0 = vector.AsUInt32();
	Vector128<uint> v1 = Sse2.Shuffle(v0, 0b_11_11_01_01);
	// https://github.com/dotnet/runtime/issues/75892
	Vector128<uint> val0 = Sse41.Max(v0, v1);
	Vector128<uint> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10);

	return (T)(object)Sse41.Max(val0, val1).ToScalar();
	}
	else if (typeof(T) == typeof(int))
	{
	Vector128<int> v0 = vector.AsInt32();
	Vector128<int> v1 = Sse2.Shuffle(v0, 0b_11_11_01_01);
	// https://github.com/dotnet/runtime/issues/75892
	Vector128<int> val0 = Sse41.Max(v0, v1);
	Vector128<int> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10);

	return (T)(object)Sse41.Max(val0, val1).ToScalar();
	}
	// These are not worth it.
	//else if (typeof(T) == typeof(ulong))
	//{
	// Vector128<ulong> v0 = vector.AsUInt64();
	// Vector128<ulong> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(ulong));

	// return (T)(object)Vector128.Max(v0, v1).ToScalar();
	//}
	//else if (typeof(T) == typeof(long))
	//{
	// Vector128<long> v0 = vector.AsInt64();
	// Vector128<long> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(long));

	// return (T)(object)Vector128.Max(v0, v1).ToScalar();
	//}
	}

	T max = vector[0];
	for (int i = 1; i < Vector128<T>.Count; i++) // loop gets unrolled if count <= 4
	{
	if (vector[i] > max)
	{
	max = vector[i];
	}
	}

	return max;
	}
	//-------------------------------------------------------------------------
	public static T ReduceMin<T>(this Vector256<T> vector) where T : struct, IBinaryInteger<T>
	{
	// TODO: implement.
	// So far I didn't think much about Vector256 here.
	// Simple way would be to operate on the two lanes, then combine lower and upper.

	throw new NotImplementedException();
	}
	//-------------------------------------------------------------------------
	public static T ReduceMax<T>(this Vector256<T> vector) where T : struct, IBinaryInteger<T>
	{
	// TODO: implement.
	// So far I didn't think much about Vector256 here.
	// Simple way would be to operate on the two lanes, then combine lower and upper.

	throw new NotImplementedException();
	}
	}