Last active
August 23, 2023 19:08
-
-
Save gfoidl/b3da2a1bacbd617c04584b2398cedc6e to your computer and use it in GitHub Desktop.
ReduceMin/Max for Vector128<T> where T : IBinaryInteger
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Numerics; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.X86; | |
internal static class VectorExtensions | |
{ | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
public static T ReduceMin<T>(this Vector128<T> vector) where T : struct, IBinaryInteger<T> | |
{ | |
if (Sse41.IsSupported) | |
{ | |
if (typeof(T) == typeof(byte)) | |
{ | |
// We treat the vector as short and shift right by 8 bits. Thus we get | |
// v0: <b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15> | |
// shifted: <b1, 0, b3, 0, b5, 0, b7, 0, b9, 0, b11, 0, b13, 0, b15, 0> | |
// Then when treated as vector of byte again, we can build 8 pairwise | |
// minimums. | |
// With SSE4.1's _mm_minpos_epu16 we get the final minimum. | |
// As all values are within the byte-range, and can't exceed it, the | |
// cast from ushort -> byte is safe. | |
Vector128<byte> v0 = vector.AsByte(); | |
Vector128<byte> shifted = Sse2.ShiftRightLogical(v0.AsInt16(), 8).AsByte(); | |
Vector128<byte> tmp = Sse2.Min(v0, shifted); | |
Vector128<ushort> minTmp = Sse41.MinHorizontal(tmp.AsUInt16()); | |
return (T)(object)(byte)minTmp.ToScalar(); | |
} | |
else if (typeof(T) == typeof(sbyte)) | |
{ | |
// We transform the values so that sbyte.MinValue -> 0, sbyte.MaxValue -> byte.MaxValue. | |
// Thus we reduced the problem to be the same as the minimum of byte-vector (above). | |
// The transformation needs to be undone before returning the result. | |
Vector128<byte> v0 = (vector.AsSByte() - Vector128.Create(sbyte.MinValue)).AsByte(); | |
byte minTmp = ReduceMin(v0); | |
return (T)(object)(sbyte)(minTmp - sbyte.MinValue); | |
} | |
else if (typeof(T) == typeof(ushort)) | |
{ | |
Vector128<ushort> minTmp = Sse41.MinHorizontal(vector.AsUInt16()); | |
return (T)(object)minTmp.ToScalar(); | |
} | |
else if (typeof(T) == typeof(short)) | |
{ | |
// We transformm the values so that short.MinValue -> 0, short.MaxValue -> ushort.MaxValue. | |
// Thus we reduced the problem to be the same as the minimum of ushort-vector (above). | |
// The transformation needs to be undone before returning the result. | |
Vector128<ushort> v0 = (vector.AsInt16() - Vector128.Create(short.MinValue)).AsUInt16(); | |
ushort minTmp = ReduceMin(v0); | |
return (T)(object)(short)(minTmp - short.MinValue); | |
} | |
else if (typeof(T) == typeof(uint)) | |
{ | |
Vector128<uint> v0 = vector.AsUInt32(); | |
Vector128<uint> v1 = Sse2.Shuffle(v0, 0b_11_11_01_01); | |
// https://github.com/dotnet/runtime/issues/75892 | |
Vector128<uint> val0 = Sse41.Min(v0, v1); | |
Vector128<uint> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10); | |
return (T)(object)Sse41.Min(val0, val1).ToScalar(); | |
} | |
else if (typeof(T) == typeof(int)) | |
{ | |
Vector128<int> v0 = vector.AsInt32(); | |
Vector128<int> v1 = Sse2.Shuffle(v0, 0b_11_11_01_01); | |
// https://github.com/dotnet/runtime/issues/75892 | |
Vector128<int> val0 = Sse41.Min(v0, v1); | |
Vector128<int> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10); | |
return (T)(object)Sse41.Min(val0, val1).ToScalar(); | |
} | |
// These are not worth it. | |
//else if (typeof(T) == typeof(ulong)) | |
//{ | |
// Vector128<ulong> v0 = vector.AsUInt64(); | |
// Vector128<ulong> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(ulong)); | |
// return (T)(object)Vector128.Min(v0, v1).ToScalar(); | |
//} | |
//else if (typeof(T) == typeof(long)) | |
//{ | |
// Vector128<long> v0 = vector.AsInt64(); | |
// Vector128<long> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(long)); | |
// return (T)(object)Vector128.Min(v0, v1).ToScalar(); | |
//} | |
} | |
T min = vector[0]; | |
for (int i = 1; i < Vector128<T>.Count; i++) // loop gets unrolled if count <= 4 | |
{ | |
if (vector[i] < min) | |
{ | |
min = vector[i]; | |
} | |
} | |
return min; | |
} | |
//------------------------------------------------------------------------- | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
public static T ReduceMax<T>(this Vector128<T> vector) where T : struct, IBinaryInteger<T> | |
{ | |
if (Sse41.IsSupported) | |
{ | |
if (typeof(T) == typeof(byte)) | |
{ | |
// We transform the values so that byte.MaxValue -> 0, byte.MinValue -> byte.MaxValue. | |
// Thus we reduced the problem to be the same as the minimum of byte-vector (above). | |
// The transformation needs to be undone before returning the result. | |
Vector128<byte> v0 = ~vector.AsByte(); | |
byte minTmp = ReduceMin(v0); | |
return (T)(object)(byte)~minTmp; | |
} | |
else if (typeof(T) == typeof(sbyte)) | |
{ | |
// We transform the values so that sbyte.MaxValue -> 0, sbyte.MinValue -> byte.MaxValue. | |
// Thus we reduced the problem to be the same as the minimum of byte-vector (above). | |
// The transformation needs to be undone before returning the result. | |
Vector128<byte> v0 = (Vector128.Create(sbyte.MaxValue) - vector.AsSByte()).AsByte(); | |
byte minTmp = ReduceMin(v0); | |
return (T)(object)(sbyte)(sbyte.MaxValue - minTmp); | |
} | |
else if (typeof(T) == typeof(ushort)) | |
{ | |
// We transform the values so that ushort.MaxValue -> 0, ushort.MinValue -> ushort.MaxValue. | |
// Thus we reduced the problem to be the same as the minimum of ushort-vector (above). | |
// The transformation needs to be undone before returning the result. | |
ushort minTmp = ReduceMin(~vector.AsUInt16()); | |
return (T)(object)(ushort)~minTmp; | |
} | |
else if (typeof(T) == typeof(short)) | |
{ | |
// We transform the values so that short.MaxValue -> 0, short.MinValue -> ushort.MaxValue. | |
// Thus we reduced the problem to be the same as the minimum of ushort-vector (above). | |
// The transformation needs to be undone before returning the result. | |
Vector128<ushort> v0 = (Vector128.Create(short.MaxValue) - vector.AsInt16()).AsUInt16(); | |
ushort minTmp = ReduceMin(v0); | |
return (T)(object)(short)(short.MaxValue - minTmp); | |
} | |
else if (typeof(T) == typeof(uint)) | |
{ | |
Vector128<uint> v0 = vector.AsUInt32(); | |
Vector128<uint> v1 = Sse2.Shuffle(v0, 0b_11_11_01_01); | |
// https://github.com/dotnet/runtime/issues/75892 | |
Vector128<uint> val0 = Sse41.Max(v0, v1); | |
Vector128<uint> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10); | |
return (T)(object)Sse41.Max(val0, val1).ToScalar(); | |
} | |
else if (typeof(T) == typeof(int)) | |
{ | |
Vector128<int> v0 = vector.AsInt32(); | |
Vector128<int> v1 = Sse2.Shuffle(v0, 0b_11_11_01_01); | |
// https://github.com/dotnet/runtime/issues/75892 | |
Vector128<int> val0 = Sse41.Max(v0, v1); | |
Vector128<int> val1 = Sse2.Shuffle(val0, 0b_11_10_01_10); | |
return (T)(object)Sse41.Max(val0, val1).ToScalar(); | |
} | |
// These are not worth it. | |
//else if (typeof(T) == typeof(ulong)) | |
//{ | |
// Vector128<ulong> v0 = vector.AsUInt64(); | |
// Vector128<ulong> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(ulong)); | |
// return (T)(object)Vector128.Max(v0, v1).ToScalar(); | |
//} | |
//else if (typeof(T) == typeof(long)) | |
//{ | |
// Vector128<long> v0 = vector.AsInt64(); | |
// Vector128<long> v1 = Sse2.ShiftRightLogical128BitLane(v0, sizeof(long)); | |
// return (T)(object)Vector128.Max(v0, v1).ToScalar(); | |
//} | |
} | |
T max = vector[0]; | |
for (int i = 1; i < Vector128<T>.Count; i++) // loop gets unrolled if count <= 4 | |
{ | |
if (vector[i] > max) | |
{ | |
max = vector[i]; | |
} | |
} | |
return max; | |
} | |
//------------------------------------------------------------------------- | |
public static T ReduceMin<T>(this Vector256<T> vector) where T : struct, IBinaryInteger<T> | |
{ | |
// TODO: implement. | |
// So far I didn't think much about Vector256 here. | |
// Simple way would be to operate on the two lanes, then combine lower and upper. | |
throw new NotImplementedException(); | |
} | |
//------------------------------------------------------------------------- | |
public static T ReduceMax<T>(this Vector256<T> vector) where T : struct, IBinaryInteger<T> | |
{ | |
// TODO: implement. | |
// So far I didn't think much about Vector256 here. | |
// Simple way would be to operate on the two lanes, then combine lower and upper. | |
throw new NotImplementedException(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The vectorized reduction is
E.g. for
byte
:or
int
:Program.cs