Faster_SpanHelpers_IndexOf.cs
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using System.Text;
using BenchmarkDotNet.Attributes;

public class P
{
    // UTF-8 bytes of "Connection: keep-alive\r\n\r\n"
    static byte[] data1 = { 67, 111, 110, 110, 101, 99, 116, 105, 111, 110, 58, 32, 107, 101, 101, 112, 45, 97, 108, 105, 118, 101, 13, 10, 13, 10 };
    // UTF-8 bytes of "Host: 10.0.0.102:5000\r\nConnection: keep-alive\r\n\r\n"
    static byte[] data2 = { 72, 111, 115, 116, 58, 32, 49, 48, 46, 48, 46, 48, 46, 49, 48, 50, 58, 53, 48, 48, 48, 13, 10, 67, 111, 110, 110, 101, 99, 116, 105, 111, 110, 58, 32, 107, 101, 101, 112, 45, 97, 108, 105, 118, 101, 13, 10, 13, 10 };
    // UTF-8 bytes of "Accept: application/json,text/html;q=0.9,application/xhtml+xml;q=0.9,application/xml;q=0.8,*/*;q=0.7\r\nHost: 10.0.0.102:5000\r\nConnection: keep-alive\r\n\r\n"
    static byte[] data3 = { 65, 99, 99, 101, 112, 116, 58, 32, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 47, 106, 115, 111, 110, 44, 116, 101, 120, 116, 47, 104, 116, 109, 108, 59, 113, 61, 48, 46, 57, 44, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 47, 120, 104, 116, 109, 108, 43, 120, 109, 108, 59, 113, 61, 48, 46, 57, 44, 97, 112, 112, 108, 105, 99, 97, 116, 105, 111, 110, 47, 120, 109, 108, 59, 113, 61, 48, 46, 56, 44, 42, 47, 42, 59, 113, 61, 48, 46, 55, 13, 10, 72, 111, 115, 116, 58, 32, 49, 48, 46, 48, 46, 48, 46, 49, 48, 50, 58, 53, 48, 48, 48, 13, 10, 67, 111, 110, 110, 101, 99, 116, 105, 111, 110, 58, 32, 107, 101, 101, 112, 45, 97, 108, 105, 118, 101, 13, 10, 13, 10 };

    static void Main()
    {
        var str1 = Encoding.UTF8.GetString(data1);
        var str2 = Encoding.UTF8.GetString(data2);
        var str3 = Encoding.UTF8.GetString(data3);
        var p = new P();
        // Sanity check: the custom implementation must agree with the BCL one.
        Debug.Assert(p.BCL_data1() == p.Egor_data1());
        Debug.Assert(p.BCL_data2() == p.Egor_data2());
        Debug.Assert(p.BCL_data3() == p.Egor_data3());
        BenchmarkDotNet.Running.BenchmarkRunner.Run<P>();
    }

    [Benchmark]
    public int BCL_data1()
    {
        var d = data1;
        return SpanHelpers.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    [Benchmark]
    public int BCL_data2()
    {
        var d = data2;
        return SpanHelpers.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    [Benchmark]
    public int BCL_data3()
    {
        var d = data3;
        return SpanHelpers.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    [Benchmark]
    public int Egor_data1()
    {
        var d = data1;
        return SpanHelpers_Egor.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    [Benchmark]
    public int Egor_data2()
    {
        var d = data2;
        return SpanHelpers_Egor.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }

    [Benchmark]
    public int Egor_data3()
    {
        var d = data3;
        return SpanHelpers_Egor.IndexOfAny(ref d[0], (byte)'\r', (byte)'\n', d.Length);
    }
}
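
// For context (not part of the original gist): these internal helpers sit behind the
// public span APIs; a roughly equivalent call through MemoryExtensions looks like this.
public static class PublicApiExample
{
    public static int FirstCrOrLf(byte[] data)
        => System.MemoryExtensions.IndexOfAny(new System.ReadOnlySpan<byte>(data), (byte)'\r', (byte)'\n');
}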

// My changes:
// Note: this experimental version assumes Sse2/Avx2 are available once the input is at
// least 16 bytes long; the BCL version below also has AdvSimd and Vector<T> fallbacks.
public static class SpanHelpers_Egor
{
    public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, nint length)
    {
        nuint ulen = (nuint)length;
        if (ulen >= 64)
            goto DOUBLE_AVX;
        if (ulen >= 32)
            goto AVX;
        if (ulen >= 16)
            goto SSE;
        // Scalar fallback for inputs shorter than one Vector128.
        for (nuint i = 0; i < ulen; i++)
        {
            byte x = Unsafe.AddByteOffset(ref searchSpace, i);
            if (x == value0 || x == value1)
                return (int)i;
        }
        return -1;
    SSE:
        {
            var vec0 = Vector128.Create(value0);
            var vec1 = Vector128.Create(value1);
            nuint offset = 0;
            ref byte b = ref searchSpace;
            while (offset < ulen)
            {
                // Clamp so the 16-byte load never reads past the end of the buffer;
                // the final chunk may overlap bytes that were already examined.
                if (offset + (nuint)Vector128<byte>.Count > ulen)
                    offset = ulen - (nuint)Vector128<byte>.Count;
                var vector = Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref b, offset));
                var matches = Sse2.MoveMask(
                    Sse2.Or(
                        Sse2.CompareEqual(vec0, vector),
                        Sse2.CompareEqual(vec1, vector))
                    .AsByte());
                if (matches == 0)
                {
                    offset += (nuint)Vector128<byte>.Count;
                    continue;
                }
                return (int)(offset + (nuint)BitOperations.TrailingZeroCount(matches));
            }
            return -1;
        }
    AVX:
        {
            var vec0 = Vector256.Create(value0);
            var vec1 = Vector256.Create(value1);
            nuint offset = 0;
            ref byte b = ref searchSpace;
            while (offset < ulen)
            {
                // Clamp so the 32-byte load never reads past the end of the buffer;
                // the final chunk may overlap bytes that were already examined.
                if (offset + (nuint)Vector256<byte>.Count > ulen)
                    offset = ulen - (nuint)Vector256<byte>.Count;
                var vector = Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref b, offset));
                var matches = Avx2.MoveMask(
                    Avx2.Or(
                        Avx2.CompareEqual(vec0, vector),
                        Avx2.CompareEqual(vec1, vector))
                    .AsByte());
                if (matches == 0)
                {
                    offset += (nuint)Vector256<byte>.Count;
                    continue;
                }
                return (int)(offset + (nuint)BitOperations.TrailingZeroCount(matches));
            }
            return -1;
        }
    DOUBLE_AVX:
        {
            var vec0 = Vector256.Create(value0);
            var vec1 = Vector256.Create(value1);
            nuint offset = 0;
            ref byte b = ref searchSpace;
            while (offset < ulen)
            {
                // Clamp so the two 32-byte loads never read past the end of the buffer;
                // the final chunk may overlap bytes that were already examined.
                if (offset + (nuint)Vector256<byte>.Count * 2 > ulen)
                    offset = ulen - (nuint)Vector256<byte>.Count * 2;
                Vector256<byte> vector1 = Unsafe.ReadUnaligned<Vector256<byte>>(
                    ref Unsafe.AddByteOffset(ref b, offset));
                Vector256<byte> vector2 = Unsafe.ReadUnaligned<Vector256<byte>>(
                    ref Unsafe.AddByteOffset(ref b, offset + (nuint)Vector256<byte>.Count));
                Vector256<byte> cmp1 = Avx2.CompareEqual(vec0, vector1);
                Vector256<byte> cmp2 = Avx2.CompareEqual(vec1, vector1);
                Vector256<byte> cmp3 = Avx2.CompareEqual(vec0, vector2);
                Vector256<byte> cmp4 = Avx2.CompareEqual(vec1, vector2);
                Vector256<byte> or1 = Avx2.Or(cmp1, cmp2).AsByte();
                Vector256<byte> or2 = Avx2.Or(cmp3, cmp4).AsByte();
                int matches1 = Avx2.MoveMask(or1);
                int matches2 = Avx2.MoveMask(or2);
                if ((matches1 | matches2) == 0)
                {
                    offset += (nuint)Vector256<byte>.Count * 2;
                    continue;
                }
                if (matches1 != 0)
                {
                    return (int)(offset + (nuint)BitOperations.TrailingZeroCount(matches1));
                }
                return (int)(offset + (nuint)Vector256<byte>.Count + (nuint)BitOperations.TrailingZeroCount(matches2));
            }
        }
        return -1;
    }
}
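
// A minimal usage sketch (not part of the original gist), assuming a non-empty byte[];
// production code would more likely accept a ReadOnlySpan<byte> and obtain the starting
// reference via MemoryMarshal.GetReference instead of indexing into the array.
public static class SpanHelpers_Egor_Example
{
    public static int IndexOfCrOrLf(byte[] data)
        => data.Length == 0
            ? -1
            : SpanHelpers_Egor.IndexOfAny(ref data[0], (byte)'\r', (byte)'\n', data.Length);
}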

//
//
// Extracted from SpanHelpers.byte.cs (100%)
//
//
public static class SpanHelpers
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector<byte> LoadVector(ref byte start, nuint offset)
        => Unsafe.ReadUnaligned<Vector<byte>>(ref Unsafe.AddByteOffset(ref start, offset));

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector256<byte> LoadVector256(ref byte start, nuint offset)
        => Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset));

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector128<byte> LoadVector128(ref byte start, nuint offset)
        => Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));

    // Vector sub-search adapted from https://github.com/aspnet/KestrelHttpServer/pull/1138
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static int LocateFirstFoundByte(Vector<byte> match)
    {
        var vector64 = Vector.AsVectorUInt64(match);
        ulong candidate = 0;
        int i = 0;
        // Pattern unrolled by jit https://github.com/dotnet/coreclr/pull/8001
        for (; i < Vector<ulong>.Count; i++)
        {
            candidate = vector64[i];
            if (candidate != 0)
            {
                break;
            }
        }
        // Single LEA instruction with jitted const (using function result)
        return i * 8 + LocateFirstFoundByte(candidate);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static int LocateFirstFoundByte(ulong match)
        => BitOperations.TrailingZeroCount(match) >> 3;
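    // Worked example for the two helpers above: a match at overall byte 11 of 'match'
    // falls in ulong element 1, byte 3 of that element, so TrailingZeroCount(candidate)
    // is 24, LocateFirstFoundByte(candidate) is 24 >> 3 == 3, and the result is 1 * 8 + 3 == 11.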

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static bool TryFindFirstMatchedLane(Vector128<byte> mask, Vector128<byte> compareResult, ref int matchedLane)
    {
        Debug.Assert(AdvSimd.Arm64.IsSupported);

        // Find the first lane that is set inside compareResult.
        Vector128<byte> maskedSelectedLanes = AdvSimd.And(compareResult, mask);
        Vector128<byte> pairwiseSelectedLane = AdvSimd.Arm64.AddPairwise(maskedSelectedLanes, maskedSelectedLanes);
        ulong selectedLanes = pairwiseSelectedLane.AsUInt64().ToScalar();
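        // Each byte of pairwiseSelectedLane packs two adjacent lanes (0x01 for the even
        // lane, 0x10 for the odd one), so every lane of compareResult maps to 4 bits of
        // selectedLanes; that is why the matched position below is shifted right by 2.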
        if (selectedLanes == 0)
        {
            // all lanes are zero, so nothing matched.
            return false;
        }

        // Find the first lane that is set inside compareResult.
        matchedLane = BitOperations.TrailingZeroCount(selectedLanes) >> 2;
        return true;
    }

    public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int length)
    {
        Debug.Assert(length >= 0);

        uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions
        uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions
        nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
        nuint lengthToExamine = (nuint)(uint)length;

        if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
        {
            // Avx2 branch also operates on Sse2 sizes, so check is combined.
            nint vectorDiff = (nint)length - Vector128<byte>.Count;
            if (vectorDiff >= 0)
            {
                // >= Sse2 intrinsics are supported, and length is enough to use them so use that path.
                // We jump forward to the intrinsics at the end of the method so a naive branch predict
                // will choose the non-intrinsic path so short lengths which don't gain anything aren't
                // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths
                // more than make this back from the intrinsics.
                lengthToExamine = (nuint)vectorDiff;
                goto IntrinsicsCompare;
            }
        }
        else if (Vector.IsHardwareAccelerated)
        {
            // Calculate lengthToExamine here for test, as it is used later
            nint vectorDiff = (nint)length - Vector<byte>.Count;
            if (vectorDiff >= 0)
            {
                // Similar as above for Vector version
                lengthToExamine = (nuint)vectorDiff;
                goto IntrinsicsCompare;
            }
        }

        uint lookUp;
        while (lengthToExamine >= 8)
        {
            lengthToExamine -= 8;

            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found1;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found2;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found3;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 4);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found4;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 5);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found5;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 6);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found6;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 7);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found7;

            offset += 8;
        }

        if (lengthToExamine >= 4)
        {
            lengthToExamine -= 4;

            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found1;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found2;
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found3;

            offset += 4;
        }

        while (lengthToExamine > 0)
        {
            lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
            if (uValue0 == lookUp || uValue1 == lookUp)
                goto Found;

            offset += 1;
            lengthToExamine -= 1;
        }

    NotFound:
        return -1;
    Found: // Workaround for https://github.com/dotnet/runtime/issues/8795
        return (int)offset;
    Found1:
        return (int)(offset + 1);
    Found2:
        return (int)(offset + 2);
    Found3:
        return (int)(offset + 3);
    Found4:
        return (int)(offset + 4);
    Found5:
        return (int)(offset + 5);
    Found6:
        return (int)(offset + 6);
    Found7:
        return (int)(offset + 7);

    IntrinsicsCompare:
        // When we move into a Vectorized block, we process everything of Vector size;
        // and then for any remainder we do a final compare of Vector size but starting at
        // the end and forwards, which may overlap on an earlier compare.
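        // For example, with only Sse2 available and length == 40: lengthToExamine is
        // 40 - 16 == 24, so the loop compares bytes [0..16) and [16..32), and the final
        // compare reloads bytes [24..40), overlapping bytes 24..31 that were already examined.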

        // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported.
        if (Sse2.IsSupported)
        {
            int matches;
            if (Avx2.IsSupported)
            {
                Vector256<byte> search;
                // Guard as we may only have a valid size for Vector128; when we will move to the Sse2
                // We have already subtracted Vector128<byte>.Count from lengthToExamine so compare against that
                // to see if we have double the size for Vector256<byte>.Count
                if (lengthToExamine >= (nuint)Vector128<byte>.Count)
                {
                    Vector256<byte> values0 = Vector256.Create(value0);
                    Vector256<byte> values1 = Vector256.Create(value1);

                    // Subtract Vector128<byte>.Count so we have now subtracted Vector256<byte>.Count
                    lengthToExamine -= (nuint)Vector128<byte>.Count;
                    // First time this checks again against 0, however we will move into final compare if it fails.
                    while (lengthToExamine > offset)
                    {
                        search = LoadVector256(ref searchSpace, offset);
                        // Bitwise Or to combine the flagged matches for the second value to our match flags
                        matches = Avx2.MoveMask(
                            Avx2.Or(
                                Avx2.CompareEqual(values0, search),
                                Avx2.CompareEqual(values1, search)));
                        // Note that MoveMask has converted the equal vector elements into a set of bit flags,
                        // So the bit position in 'matches' corresponds to the element offset.
                        if (matches == 0)
                        {
                            // None matched
                            offset += (nuint)Vector256<byte>.Count;
                            continue;
                        }

                        goto IntrinsicsMatch;
                    }

                    // Move to Vector length from end for final compare
                    search = LoadVector256(ref searchSpace, lengthToExamine);
                    offset = lengthToExamine;
                    // Same as method as above
                    matches = Avx2.MoveMask(
                        Avx2.Or(
                            Avx2.CompareEqual(values0, search),
                            Avx2.CompareEqual(values1, search)));
                    if (matches == 0)
                    {
                        // None matched
                        goto NotFound;
                    }

                    goto IntrinsicsMatch;
                }
            }

            // Initial size check was done on method entry.
            Debug.Assert(length >= Vector128<byte>.Count);
            {
                Vector128<byte> search;
                Vector128<byte> values0 = Vector128.Create(value0);
                Vector128<byte> values1 = Vector128.Create(value1);
                // First time this checks against 0 and we will move into final compare if it fails.
                while (lengthToExamine > offset)
                {
                    search = LoadVector128(ref searchSpace, offset);

                    matches = Sse2.MoveMask(
                        Sse2.Or(
                            Sse2.CompareEqual(values0, search),
                            Sse2.CompareEqual(values1, search))
                        .AsByte());
                    // Note that MoveMask has converted the equal vector elements into a set of bit flags,
                    // So the bit position in 'matches' corresponds to the element offset.
                    if (matches == 0)
                    {
                        // None matched
                        offset += (nuint)Vector128<byte>.Count;
                        continue;
                    }

                    goto IntrinsicsMatch;
                }
                // Move to Vector length from end for final compare
                search = LoadVector128(ref searchSpace, lengthToExamine);
                offset = lengthToExamine;
                // Same as method as above
                matches = Sse2.MoveMask(
                    Sse2.Or(
                        Sse2.CompareEqual(values0, search),
                        Sse2.CompareEqual(values1, search)));
                if (matches == 0)
                {
                    // None matched
                    goto NotFound;
                }
            }

        IntrinsicsMatch:
            // Find bitflag offset of first difference and add to current offset
            offset += (nuint)BitOperations.TrailingZeroCount(matches);
            goto Found;
        }
        else if (AdvSimd.Arm64.IsSupported)
        {
            // Mask to help find the first lane in compareResult that is set.
            // LSB 0x01 corresponds to lane 0, 0x10 - to lane 1, and so on.
            Vector128<byte> mask = Vector128.Create((ushort)0x1001).AsByte();
            int matchedLane = 0;

            Vector128<byte> search;
            Vector128<byte> matches;
            Vector128<byte> values0 = Vector128.Create(value0);
            Vector128<byte> values1 = Vector128.Create(value1);
            // First time this checks against 0 and we will move into final compare if it fails.
            while (lengthToExamine > offset)
            {
                search = LoadVector128(ref searchSpace, offset);

                matches = AdvSimd.Or(
                    AdvSimd.CompareEqual(values0, search),
                    AdvSimd.CompareEqual(values1, search));

                if (!TryFindFirstMatchedLane(mask, matches, ref matchedLane))
                {
                    // Zero flags set so no matches
                    offset += (nuint)Vector128<byte>.Count;
                    continue;
                }

                // Find bitflag offset of first match and add to current offset
                offset += (uint)matchedLane;

                goto Found;
            }

            // Move to Vector length from end for final compare
            search = LoadVector128(ref searchSpace, lengthToExamine);
            offset = lengthToExamine;
            // Same as method as above
            matches = AdvSimd.Or(
                AdvSimd.CompareEqual(values0, search),
                AdvSimd.CompareEqual(values1, search));

            if (!TryFindFirstMatchedLane(mask, matches, ref matchedLane))
            {
                // None matched
                goto NotFound;
            }

            // Find bitflag offset of first match and add to current offset
            offset += (nuint)(uint)matchedLane;

            goto Found;
        }
        else if (Vector.IsHardwareAccelerated)
        {
            Vector<byte> values0 = new Vector<byte>(value0);
            Vector<byte> values1 = new Vector<byte>(value1);

            Vector<byte> search;
            // First time this checks against 0 and we will move into final compare if it fails.
            while (lengthToExamine > offset)
            {
                search = LoadVector(ref searchSpace, offset);
                search = Vector.BitwiseOr(
                    Vector.Equals(search, values0),
                    Vector.Equals(search, values1));
                if (Vector<byte>.Zero.Equals(search))
                {
                    // None matched
                    offset += (nuint)Vector<byte>.Count;
                    continue;
                }

                goto VectorMatch;
            }

            // Move to Vector length from end for final compare
            search = LoadVector(ref searchSpace, lengthToExamine);
            offset = lengthToExamine;
            search = Vector.BitwiseOr(
                Vector.Equals(search, values0),
                Vector.Equals(search, values1));
            if (Vector<byte>.Zero.Equals(search))
            {
                // None matched
                goto NotFound;
            }

        VectorMatch:
            offset += (nuint)LocateFirstFoundByte(search);
            goto Found;
        }

        Debug.Fail("Unreachable");
        goto NotFound;
    }
}