using System; | |
using System.Linq; | |
using System.Numerics; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.Intrinsics.X86; | |
using System.Runtime.InteropServices; | |
using BenchmarkDotNet.Analysers; | |
using BenchmarkDotNet.Attributes; | |
using BenchmarkDotNet.Columns; | |
using BenchmarkDotNet.Configs; | |
using BenchmarkDotNet.Diagnosers; | |
using BenchmarkDotNet.Environments; | |
using BenchmarkDotNet.Exporters; | |
using BenchmarkDotNet.Jobs; | |
using BenchmarkDotNet.Validators; | |
using BenchmarkDotNet.Running; | |
using RunMode = BenchmarkDotNet.Jobs.RunMode; | |
using System.Diagnostics; | |
using System.Runtime.Intrinsics; | |
namespace ConsoleApp3 | |
{ | |
internal static class BitOps | |
{ | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
public static int TrailingZeroCount(int matches) | |
{ | |
if (Bmi1.IsSupported) | |
{ | |
return (int)Bmi1.TrailingZeroCount((uint)matches); | |
} | |
else // Software fallback | |
{ | |
// https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup
// Any uint shifted right by 27 is in the range [0, 31], so we use Unsafe.AddByteOffset to avoid the bounds check.
return Unsafe.AddByteOffset(
ref MemoryMarshal.GetReference(TrailingCountMultiplyDeBruijn),
new IntPtr(((uint)((matches & -matches) * 0x077CB531U)) >> 27));
} | |
} | |
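// De Bruijn lookup: (matches & -matches) isolates the lowest set bit; multiplying by the De Bruijn
// constant 0x077CB531 and taking the top 5 bits yields a distinct index for each power of two.
// Example: matches = 0b1000 -> isolated bit 8, (8 * 0x077CB531) >> 27 = 7, table[7] = 3 trailing zeros.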
private static ReadOnlySpan<byte> TrailingCountMultiplyDeBruijn => new byte[32] | |
{ | |
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, | |
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 | |
}; | |
} | |
internal static class Bits | |
{ | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
public static int TrailingZeroesInBytes(ulong value) | |
{ | |
// Any ulong shifted right by 58 is in the range [0, 63], so we use Unsafe.AddByteOffset to avoid the bounds check.
return Unsafe.AddByteOffset(
ref MemoryMarshal.GetReference(DeBruijnBytePos64),
new IntPtr((int)(((value & (ulong)(-(long)value)) * 0x0218A392CDABBD3FUL) >> 58)));
} | |
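// This 64-entry De Bruijn table maps the lowest set bit of a 64-bit value directly to the index of the
// byte that contains it (values 0..7), so a caller can jump straight to the first differing byte.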
private static ReadOnlySpan<byte> DeBruijnBytePos64 => new byte[64] | |
{ | |
0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, | |
7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 | |
}; | |
} | |
class Program | |
{ | |
static void Main(string[] args) | |
{ | |
Console.WriteLine($"{nameof(Sse)} support: {Sse.IsSupported}"); | |
Console.WriteLine($"{nameof(Sse2)} support: {Sse2.IsSupported}"); | |
Console.WriteLine($"{nameof(Sse3)} support: {Sse3.IsSupported}"); | |
Console.WriteLine($"{nameof(Sse41)} support: {Sse41.IsSupported}"); | |
Console.WriteLine($"{nameof(Avx)} support: {Avx.IsSupported}"); | |
Console.WriteLine($"{nameof(Avx2)} support: {Avx2.IsSupported}"); | |
//var p = new Compare(); | |
//p.KeySize = 4096 * 4096 - 1; | |
//p.Setup(); | |
//for (int i = 0; i < 100; i++) | |
// p.WithCacheMisses_Framework_SequenceCompareTo_Alternative(); | |
//for (int i = 0; i < 100; i++) | |
// p.WithCacheMisses_Framework_SequenceCompareTo_Internal_Ben5f6dcb8(); | |
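//// Quick agreement check (a sketch only, not part of the benchmark run): on the same Setup() data the
//// reference comparer and the two alternatives should return identical per-key results, so the three
//// sums printed below should match.
//var check = new Compare { KeySize = 63 };
//check.Setup();
//Console.WriteLine(check.WithCacheMisses_Framework_MemoryExtensions_Reference());
//Console.WriteLine(check.WithCacheMisses_Framework_SequenceCompareTo_Internal_Ben5f6dcb8());
//Console.WriteLine(check.WithCacheMisses_Framework_SequenceCompareTo_Alternative());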
BenchmarkRunner.Run<Compare>(); | |
} | |
} | |
//[HardwareCounters(HardwareCounter.CacheMisses, HardwareCounter.TotalIssues, HardwareCounter.BranchMispredictions, HardwareCounter.InstructionRetired )] | |
//[DisassemblyDiagnoser] | |
[Config(typeof(Compare.Config))] | |
public unsafe class Compare | |
{ | |
private class Config : ManualConfig | |
{ | |
public Config() | |
{ | |
Add(new Job(RunMode.Default) | |
{ | |
Environment = | |
{ | |
Runtime = Runtime.Core, | |
Platform = Platform.X64, | |
Jit = Jit.RyuJit | |
} | |
}); | |
// Exporters for data | |
Add(GetExporters().ToArray()); | |
// Generate plots using R if %R_HOME% is correctly set | |
Add(RPlotExporter.Default); | |
Add(StatisticColumn.AllStatistics); | |
Add(BaselineValidator.FailOnError); | |
Add(JitOptimizationsValidator.FailOnError); | |
Add(EnvironmentAnalyser.Default); | |
} | |
} | |
//[Params(7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256, 1024, 2048, 4096, 4096 * 16, 4096 * 4096)] | |
//[Params(15, 31, 39, 48, 50, 63, 90, 117, 127, 190, 255, 256)] | |
//[Params(7, 8, 15, 16, 24, 31, 32, 64, 127, 128, 256, 1024, 2048, 4096)] | |
//[Params(7, 15, 16, 27, 32, 33, 4096, 4096 * 16, 4096 * 4096)] | |
[Params(63, 4095, 4096 * 16 - 1)] | |
public int KeySize { get; set; } | |
public const int Operations = 100; | |
private int size = 1024 * 1024 * 100; | |
private byte* source; | |
private byte* destination; | |
private int[] randomLocation; | |
private int[] sameLocation; | |
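// The Vector<byte>-based comparers assume a 32-byte vector width (AVX2); Setup() throws if the hardware does not match.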
public const int VectorBytes = 32; | |
[GlobalSetup] | |
public void Setup() | |
{ | |
if (Vector<byte>.Count != VectorBytes)
throw new NotSupportedException($"Vector<byte>.Count is {Vector<byte>.Count}, but these benchmarks assume {VectorBytes} bytes per vector.");
source = (byte*)Marshal.AllocHGlobal(size).ToPointer(); | |
destination = (byte*)Marshal.AllocHGlobal(size).ToPointer(); | |
var r = new Random(); | |
for (int i = 0; i < size; i++)
{
// Make the last byte of every key differ so a compare has to scan the whole key before deciding.
if (i % KeySize == KeySize - 1)
{
source[i] = 0;
destination[i] = 1;
}
else
{
int b = r.Next();
source[i] = (byte)b;
destination[i] = (byte)b;
}
}
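// randomLocation scatters the Operations start points across the 100 MB buffers so every compare is
// likely to miss cache; sameLocation pins them all to offset 0 and is not read by the enabled benchmarks.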
randomLocation = new int[Operations]; | |
sameLocation = new int[Operations]; | |
int range = (size / KeySize) - 1; | |
for (int i = 0; i < randomLocation.Length; i++) | |
{ | |
randomLocation[i] = r.Next(range) * KeySize; | |
sameLocation[i] = 0; | |
} | |
} | |
//[Benchmark(OperationsPerInvoke = Operations)] | |
public int WithCacheMisses_Scalar() | |
{ | |
int r = 0; | |
foreach (int index in randomLocation) | |
r += CompareOriginal(source + index, destination + index, KeySize); | |
return r; | |
} | |
//[Benchmark(OperationsPerInvoke = Operations)] | |
public int WithCacheMisses_Framework_MemoryExtensions_Reference() | |
{ | |
int r = 0; | |
foreach (int index in randomLocation) | |
r += MemoryExtensions.SequenceCompareTo(new ReadOnlySpan<byte>(source + index, KeySize), new ReadOnlySpan<byte>(destination + index, KeySize)); | |
return r; | |
} | |
[Benchmark(OperationsPerInvoke = Operations)] | |
public int WithCacheMisses_Framework_SequenceCompareTo_Internal_Ben5f6dcb8() | |
{ | |
int r = 0; | |
foreach (int index in randomLocation) | |
r += SequenceCompareTo_Ben5f6dcb8(ref *(source + index), KeySize, ref *(destination + index), KeySize); | |
return r; | |
} | |
[Benchmark(OperationsPerInvoke = Operations)] | |
public int WithCacheMisses_Framework_SequenceCompareTo_Alternative() | |
{ | |
int r = 0; | |
foreach (int index in randomLocation) | |
r += SequenceCompareTo_Alternative(ref *(source + index), KeySize, ref *(destination + index), KeySize); | |
return r; | |
} | |
//[Benchmark(OperationsPerInvoke = Operations)] | |
public int WithCacheMisses_Numerics32() | |
{ | |
int r = 0; | |
foreach (int index in randomLocation) | |
r += CompareNumerics(source + index, destination + index, KeySize); | |
return r; | |
} | |
//[Benchmark(OperationsPerInvoke = Operations)] | |
//public int WithCacheMisses_ScalarXor() | |
//{ | |
// int r = 0; | |
// foreach (int index in randomLocation) | |
// r += CompareScalarAltDeBrujain(source + index, destination + index, KeySize); | |
// return r; | |
//} | |
//[Benchmark(OperationsPerInvoke = Operations)] | |
//public int WithCacheMisses_ScalarCmpXorPopCount() | |
//{ | |
// int r = 0; | |
// foreach (int index in randomLocation) | |
// r += CompareScalarCmpAltPopCount(source + index, destination + index, KeySize); | |
// return r; | |
//} | |
//[Benchmark(OperationsPerInvoke = Operations)] | |
//public int WithCacheMisses_ScalarCmpXorPopCount_Prefetch() | |
//{ | |
// int r = 0; | |
// foreach (int index in randomLocation) | |
// r += CompareScalarCmpAltPopCount_Prefetch(source + index, destination + index, KeySize); | |
// return r; | |
//} | |
//[Benchmark(OperationsPerInvoke = Operations)] | |
//public int WithCacheMisses_ScalarAlt2() | |
//{ | |
// int r = 0; | |
// foreach (int index in randomLocation) | |
// r += CompareScalarAlt2(source + index, destination + index, KeySize); | |
// return r; | |
//} | |
//[Benchmark(OperationsPerInvoke = Operations)] | |
//public int WithCacheMisses_ScalarAlt3() | |
//{ | |
// int r = 0; | |
// foreach (int index in randomLocation) | |
// r += CompareScalarAlt3(source + index, destination + index, KeySize); | |
// return r; | |
//} | |
//[Benchmark(OperationsPerInvoke = Operations)] | |
//public int WithCacheMisses_NumericsAlt32() | |
//{ | |
// int r = 0; | |
// foreach (int index in randomLocation) | |
// r += CompareNumericsAlt(source + index, destination + index, KeySize); | |
// return r; | |
//} | |
//[Benchmark] | |
public int NoCacheMisses_Scalar() | |
{ | |
return CompareOriginal(source, destination, KeySize); | |
} | |
//[Benchmark] | |
public int NoCacheMisses_Framework_MemoryExtensions_Reference() | |
{ | |
return MemoryExtensions.SequenceCompareTo(new Span<byte>(source, KeySize), new Span<byte>(destination, KeySize)); | |
} | |
[Benchmark] | |
public int NoCacheMisses_Framework_SequenceCompareTo_Internal_Ben5f6dcb8() | |
{ | |
return SequenceCompareTo_Ben5f6dcb8(ref *(source), KeySize, ref *(destination), KeySize); | |
} | |
[Benchmark] | |
public int NoCacheMisses_Framework_SequenceCompareTo_Alternative() | |
{ | |
return SequenceCompareTo_Alternative(ref *(source), KeySize, ref *(destination), KeySize); | |
} | |
//[Benchmark] | |
//public int NoCacheMisses_ScalarXor() | |
//{ | |
// return CompareScalarAltDeBrujain(source, destination, KeySize); | |
//} | |
//[Benchmark] | |
//public int NoCacheMisses_ScalarXorPopCount() | |
//{ | |
// return CompareScalarAltPopCount(source, destination, KeySize); | |
//} | |
//[Benchmark] | |
//public int NoCacheMisses_ScalarCmpXorPopCount() | |
//{ | |
// return CompareScalarCmpAltPopCount(source, destination, KeySize); | |
//} | |
//[Benchmark] | |
//public int NoCacheMisses_ScalarCmpXorPopCount_Prefetch() | |
//{ | |
// return CompareScalarCmpAltPopCount_Prefetch(source, destination, KeySize); | |
//} | |
//[Benchmark] | |
public int NoCacheMisses_Numerics32() | |
{ | |
return CompareNumerics(source, destination, KeySize); | |
} | |
//[Benchmark] | |
//public int NoCacheMisses_NumericsAlt32() | |
//{ | |
// return CompareNumericsAlt(source, destination, KeySize); | |
//} | |
private static int CompareScalarAltPopCount(void* p1, void* p2, int size) | |
{ | |
byte* bpx = (byte*)p1; | |
byte* bpy = (byte*)p2; | |
// PERF: This lets us do pointer arithmetic and use relative addressing in the
// hardware instructions without needing an extra register.
long offset = bpy - bpx;
if (size < 8)
goto ProcessSmall;
// PERF: The current JIT (2.0.5) emits a 4-instruction magic division here
// instead of a simple shift, even though the divisor is a power of 2.
int l = size >> 3; // (Equivalent to size / 8)
ulong xor; | |
for (int i = 0; i < l; i++, bpx += 8) | |
{ | |
// PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]``` | |
xor = *((ulong*)bpx) ^ *(ulong*)(bpx + offset); | |
if (xor != 0) | |
goto Tail; | |
} | |
ProcessSmall: | |
if ((size & 4) != 0) | |
{ | |
xor = *((uint*)bpx) ^ *((uint*)(bpx + offset)); | |
if (xor != 0) | |
goto Tail; | |
bpx += 4; | |
} | |
if ((size & 2) != 0) | |
{ | |
xor = (ulong)(*((ushort*)bpx) ^ *((ushort*)(bpx + offset))); | |
if (xor != 0) | |
goto Tail; | |
bpx += 2; | |
} | |
if ((size & 1) != 0) | |
{ | |
return *bpx - *(bpx + offset); | |
} | |
return 0; | |
Tail: | |
// PERF: Bit twiddling hack. XORing the two values sets a bit wherever they differ; on little-endian
// hardware the lowest set bit therefore falls inside the first differing byte, and
// PopCount((xor & -xor) - 1) >> 3 turns that bit position into the offset of that byte.
bpx += Popcnt.X64.PopCount((ulong)((long)xor & -(long)xor) - 1) >> 3;
return *bpx - *(bpx + offset); | |
} | |
private static int CompareScalarCmpAltPopCount_Prefetch(void* p1, void* p2, int size) | |
{ | |
Sse.Prefetch2(p1); | |
Sse.Prefetch2(p2); | |
byte* bpx = (byte*)p1; | |
// PERF: This lets us do pointer arithmetic and use relative addressing in the
// hardware instructions without needing an extra register.
long offset = (byte*)p2 - bpx;
if ((size & 7) == 0)
goto ProcessAligned;
// Process the "unaligned" tail (size mod 8) first.
ulong xor; | |
if ((size & 4) != 0) | |
{ | |
xor = *((uint*)bpx) ^ *((uint*)(bpx + offset)); | |
if (xor != 0) | |
goto Tail; | |
bpx += 4; | |
} | |
if ((size & 2) != 0) | |
{ | |
xor = (ulong)(*((ushort*)bpx) ^ *((ushort*)(bpx + offset))); | |
if (xor != 0) | |
goto Tail; | |
bpx += 2; | |
} | |
if ((size & 1) != 0) | |
{ | |
int value = *bpx - *(bpx + offset); | |
if (value != 0) | |
return value; | |
bpx += 1; | |
} | |
ProcessAligned: | |
byte* end = (byte*)p1 + size; | |
byte* loopEnd = end - 16; | |
while (bpx <= loopEnd) | |
{ | |
// PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]``` | |
if (*((ulong*)bpx) != *(ulong*)(bpx + offset)) | |
goto XorTail; | |
if (*((ulong*)(bpx + 8)) != *(ulong*)(bpx + 8 + offset)) | |
{ | |
bpx += 8; | |
goto XorTail; | |
} | |
bpx += 16; | |
} | |
if (bpx < end) | |
goto XorTail; | |
return 0; | |
XorTail: | |
xor = *((ulong*)bpx) ^ *(ulong*)(bpx + offset); | |
Tail: | |
// Fast-path for equals | |
if (xor == 0) | |
return 0; | |
// PERF: Bit twiddling hack. XORing the two values sets a bit wherever they differ; on little-endian
// hardware the lowest set bit falls inside the first differing byte.
bpx += Popcnt.X64.PopCount((ulong)((long)xor & -(long)xor) - 1) >> 3; | |
return *bpx - *(bpx + offset); | |
} | |
private static int CompareScalarCmpAltPopCount(void* p1, void* p2, int size) | |
{ | |
byte* bpx = (byte*)p1; | |
// PERF: This lets us do pointer arithmetic and use relative addressing in the
// hardware instructions without needing an extra register.
long offset = (byte*)p2 - bpx;
if ((size & 7) == 0)
goto ProcessAligned;
// Process the "unaligned" tail (size mod 8) first.
ulong xor; | |
if ((size & 4) != 0) | |
{ | |
xor = *((uint*)bpx) ^ *((uint*)(bpx + offset)); | |
if (xor != 0) | |
goto Tail; | |
bpx += 4; | |
} | |
if ((size & 2) != 0) | |
{ | |
xor = (ulong)(*((ushort*)bpx) ^ *((ushort*)(bpx + offset))); | |
if (xor != 0) | |
goto Tail; | |
bpx += 2; | |
} | |
if ((size & 1) != 0) | |
{ | |
int value = *bpx - *(bpx + offset); | |
if (value != 0) | |
return value; | |
bpx += 1; | |
} | |
ProcessAligned: | |
byte* end = (byte*)p1 + size; | |
byte* loopEnd = end - 16; | |
while (bpx <= loopEnd) | |
{ | |
// PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]``` | |
if (*((ulong*)bpx) != *(ulong*)(bpx + offset)) | |
goto XorTail; | |
if (*((ulong*)(bpx + 8)) != *(ulong*)(bpx + 8 + offset)) | |
{ | |
bpx += 8; | |
goto XorTail; | |
} | |
bpx += 16; | |
} | |
if (bpx < end) | |
goto XorTail; | |
return 0; | |
XorTail: xor = *((ulong*)bpx) ^ *(ulong*)(bpx + offset); | |
Tail: | |
// Fast-path for equals | |
if (xor == 0) | |
return 0; | |
// PERF: Bit twiddling hack. XORing the two values sets a bit wherever they differ; on little-endian
// hardware the lowest set bit falls inside the first differing byte.
bpx += Popcnt.X64.PopCount((ulong)((long)xor & -(long)xor) - 1) >> 3; | |
return *bpx - *(bpx + offset); | |
} | |
private static int CompareScalarAltDeBrujain(void* p1, void* p2, int size) | |
{ | |
byte* bpx = (byte*)p1; | |
// PERF: This lets us do pointer arithmetic and use relative addressing in the
// hardware instructions without needing an extra register.
long offset = (byte*)p2 - bpx;
if (size < 8)
goto ProcessSmall;
// PERF: The current JIT (2.0.5) emits a 4-instruction magic division here
// instead of a simple shift, even though the divisor is a power of 2.
int l = size >> 3; // (Equivalent to size / 8)
ulong xor; | |
for (int i = 0; i < l; i++, bpx += 8) | |
{ | |
// PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]``` | |
xor = *((ulong*)bpx) ^ *(ulong*)(bpx + offset); | |
if (xor != 0) | |
goto Tail; | |
} | |
ProcessSmall: | |
if ((size & 4) != 0) | |
{ | |
xor = *((uint*)bpx) ^ *((uint*)(bpx + offset)); | |
if (xor != 0) | |
goto Tail; | |
bpx += 4; | |
} | |
if ((size & 2) != 0) | |
{ | |
xor = (ulong)(*((ushort*)bpx) ^ *((ushort*)(bpx + offset))); | |
if (xor != 0) | |
goto Tail; | |
bpx += 2; | |
} | |
if ((size & 1) != 0) | |
{ | |
return *bpx - *(bpx + offset); | |
} | |
return 0; | |
Tail: | |
// PERF: Bit twiddling hack. XORing the two values sets a bit wherever they differ; on little-endian
// hardware the lowest set bit falls inside the first differing byte, and TrailingZeroesInBytes maps
// it straight to that byte's offset.
bpx += Bits.TrailingZeroesInBytes(xor); | |
return *bpx - *(bpx + offset); | |
} | |
private static int CompareScalarAlt2(void* p1, void* p2, int size) | |
{ | |
// If we used an unmanaged bulk version with an inline compare, the call site would not get optimized properly.
// If you know you will be comparing big memory chunks, do not use the inline version.
byte* bpx = (byte*)p1; | |
byte* bpy = (byte*)p2; | |
long offset = bpy - bpx; | |
if (size < 8) | |
goto ProcessSmall; | |
int l = size >> 3; // (Equivalent to size / 8) | |
for (int i = 0; i < l; i++, bpx += 8) | |
{ | |
if (*((long*)bpx) != *((long*)(bpx + offset))) | |
{ | |
// Reset size: from here on we only need to test the 4-byte halves of the differing word.
size = 4; | |
goto ProcessWord; | |
} | |
} | |
goto ProcessSmall; | |
ProcessWord: | |
// We know that we have 8 valid bytes to process and they are different somewhere. | |
// We then check if half of it is different. | |
if (*((int*)bpx) == *((int*)(bpx + offset))) | |
bpx += 4; | |
ProcessSmall: | |
if ((size & 4) != 0) | |
{ | |
if (*((int*)bpx) == *((int*)(bpx + offset))) | |
{ | |
bpx += 4; | |
} | |
} | |
if ((size & 2) != 0) | |
{ | |
if (*((short*)bpx) == *((short*)(bpx + offset))) | |
{ | |
bpx += 2; | |
} | |
} | |
if ((size & 1) != 0) | |
{ | |
return *bpx - *(bpx + offset); | |
} | |
return 0; | |
} | |
private static int CompareScalarAlt3(void* p1, void* p2, int size) | |
{ | |
// If we used an unmanaged bulk version with an inline compare, the call site would not get optimized properly.
// If you know you will be comparing big memory chunks, do not use the inline version.
byte* bpx = (byte*)p1; | |
byte* bpy = (byte*)p2; | |
long offset = bpy - bpx; | |
if (size < 8) | |
goto ProcessSmall; | |
int l = size >> 3; // (Equivalent to size / 8) | |
for (int i = 0; i < l; i++, bpx += 8) | |
{ | |
if (*((long*)bpx) != *((long*)(bpx + offset))) | |
break; | |
} | |
// Reset size: from here on we only need to test the 4-byte halves of the differing word.
size = 4; | |
if (*((int*)bpx) == *((int*)(bpx + offset))) | |
bpx += 4; | |
ProcessSmall: | |
if ((size & 4) != 0) | |
{ | |
if (*((int*)bpx) == *((int*)(bpx + offset))) | |
{ | |
bpx += 4; | |
} | |
} | |
if (((byte)size & 2) != 0) | |
{ | |
if (*((short*)bpx) == *((short*)(bpx + offset))) | |
{ | |
bpx += 2; | |
} | |
} | |
if (((byte)size & 1) != 0) | |
{ | |
return *bpx - *(bpx + offset); | |
} | |
return 0; | |
} | |
private static int CompareOriginal(void* p1, void* p2, int size) | |
{ | |
// If we used an unmanaged bulk version with an inline compare, the call site would not get optimized properly.
// If you know you will be comparing big memory chunks, do not use the inline version.
int l = size; | |
byte* bpx = (byte*)p1, bpy = (byte*)p2; | |
int last; | |
for (int i = 0; i < l / 8; i++, bpx += 8, bpy += 8) | |
{ | |
if (*((long*)bpx) != *((long*)bpy)) | |
{ | |
last = 8; | |
goto Tail; | |
} | |
} | |
if ((l & 4) != 0) | |
{ | |
if (*((int*)bpx) != *((int*)bpy)) | |
{ | |
last = 4; | |
goto Tail; | |
} | |
bpx += 4; | |
bpy += 4; | |
} | |
if ((l & 2) != 0) | |
{ | |
if (*((short*)bpx) != *((short*)bpy)) | |
{ | |
last = 2; | |
goto Tail; | |
} | |
bpx += 2; | |
bpy += 2; | |
} | |
if ((l & 1) != 0) | |
{ | |
return (*((byte*)bpx) - *((byte*)bpy)); | |
} | |
return 0; | |
Tail: | |
while (last > 0) | |
{ | |
if (*((byte*)bpx) != *((byte*)bpy)) | |
return *bpx - *bpy; | |
bpx++; | |
bpy++; | |
last--; | |
} | |
return 0; | |
} | |
private static int CompareNumericsAlt(void* p1, void* p2, int size) | |
{ | |
byte* bpx = (byte*)p1, bpy = (byte*)p2; | |
byte* end = bpx + size; | |
byte* currentEnd = end - (size & (32 - 1)); | |
while (bpx < currentEnd) | |
{ | |
var vx = Unsafe.Read<Vector<byte>>(bpx); | |
var vy = Unsafe.Read<Vector<byte>>(bpy); | |
var xor = Vector.Xor(vx, vy); | |
if (xor != Vector<byte>.Zero)
break; | |
bpx += 32; | |
bpy += 32; | |
} | |
currentEnd = end - (size & (sizeof(long) - 1)); | |
while (bpx < currentEnd) | |
{ | |
ulong vx = ((ulong*)bpx)[0]; | |
ulong vy = ((ulong*)bpy)[0]; | |
if (vx != vy) | |
break; | |
bpx += 8; | |
bpy += 8; | |
} | |
while (bpx < end) | |
{ | |
int r = *bpx - *bpy; | |
if (r != 0) | |
return r; | |
bpx += 1; | |
bpy += 1; | |
} | |
return 0; | |
} | |
private static int CompareNumerics(void* p1, void* p2, int size) | |
{ | |
byte* bpx = (byte*)p1, bpy = (byte*)p2; | |
// If we used an unmanaged bulk version with an inline compare, the call site would not get optimized properly.
// If you know you will be comparing big memory chunks, do not use the inline version.
int l = size / VectorBytes; // This should translate into a shift operation. | |
size -= l * VectorBytes; // Remaining tail bytes once the whole vectors are handled by the loop below.
while (l > 0) | |
{ | |
var vx = Unsafe.Read<Vector<byte>>(bpx); | |
var vy = Unsafe.Read<Vector<byte>>(bpy); | |
var xor = Vector.Xor(vx, vy); | |
if (xor != Vector<byte>.Zero) | |
break; | |
l--; | |
bpx += VectorBytes; | |
bpy += VectorBytes; | |
} | |
if (size <= 8) | |
goto Last; | |
if (size > 8 && ((long*)bpx)[0] != ((long*)bpy)[0]) | |
goto Last; | |
if (size > 16 && ((long*)bpx)[1] != ((long*)bpy)[1]) | |
goto Last; | |
if (size > 24 && ((long*)bpx)[2] != ((long*)bpy)[2]) | |
goto Last; | |
if (size == 32 && ((long*)bpx)[3] != ((long*)bpy)[3]) | |
goto Last; | |
return 0; | |
Last: | |
size %= 8; // This should translate to an AND operation.
int last = 0; | |
while (size > 0) | |
{ | |
int r = bpx[last] - bpy[last]; | |
if (r != 0) | |
return r; | |
size--; | |
last++; | |
} | |
return 0; | |
} | |
public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref byte second, int secondLength) | |
{ | |
Debug.Assert(firstLength >= 0); | |
Debug.Assert(secondLength >= 0); | |
if (Unsafe.AreSame(ref first, ref second)) | |
goto Equal; | |
IntPtr minLength = (IntPtr)((firstLength < secondLength) ? firstLength : secondLength); | |
IntPtr offset = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations | |
IntPtr nLength = (IntPtr)(void*)minLength; | |
if (Avx2.IsSupported) | |
{ | |
if ((byte*)nLength >= (byte*)Vector256<byte>.Count) | |
{ | |
nLength -= Vector256<byte>.Count; | |
int matches; | |
while ((byte*)nLength > (byte*)offset) | |
{ | |
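// MoveMask packs the 32 per-byte equality results into one bit each, so a mask of -1
// (all 32 bits set) means the entire 32-byte block matched.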
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset))); | |
if (matches == -1) | |
{ | |
// All matched | |
offset += Vector256<byte>.Count; | |
continue; | |
} | |
goto Difference; | |
} | |
// Move to Vector length from end for final compare | |
offset = nLength; | |
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset))); | |
if (matches == -1) | |
{ | |
// All matched | |
goto Equal; | |
} | |
Difference: | |
// Invert matches to find differences | |
int differences = ~matches; | |
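// The lowest set bit of 'differences' marks the first byte that did not match; its trailing zero
// count is that byte's index within the block, which is added to offset before the final byte compare.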
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences)); | |
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset)); | |
Debug.Assert(result != 0); | |
return result; | |
} | |
if ((byte*)nLength >= (byte*)Vector128<byte>.Count) | |
{ | |
nLength -= Vector128<byte>.Count; | |
int matches; | |
if ((byte*)nLength > (byte*)offset) | |
{ | |
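// For 128-bit vectors MoveMask fills only the low 16 bits, so an all-equal block shows up as 0xFFFF rather than -1.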
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset))); | |
if (matches == 0xFFFF) | |
{ | |
// All matched | |
offset += Vector128<byte>.Count; | |
} | |
else | |
{ | |
goto Difference; | |
} | |
} | |
// Move to Vector length from end for final compare | |
offset = nLength; | |
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset))); | |
if (matches == 0xFFFF) | |
{ | |
// All matched | |
goto Equal; | |
} | |
Difference: | |
// Invert matches to find differences | |
int differences = ~matches; | |
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences)); | |
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset)); | |
Debug.Assert(result != 0); | |
return result; | |
} | |
} | |
else if (Sse2.IsSupported) | |
{ | |
if ((byte*)nLength >= (byte*)Vector128<byte>.Count) | |
{ | |
nLength -= Vector128<byte>.Count; | |
int matches; | |
while ((byte*)nLength > (byte*)offset) | |
{ | |
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset))); | |
if (matches == 0xFFFF) | |
{ | |
// All matched | |
offset += Vector128<byte>.Count; | |
continue; | |
} | |
goto Difference; | |
} | |
// Move to Vector length from end for final compare | |
offset = nLength; | |
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset))); | |
if (matches == 0xFFFF) | |
{ | |
// All matched | |
goto Equal; | |
} | |
Difference: | |
// Invert matches to find differences | |
int differences = ~matches; | |
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences)); | |
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset)); | |
Debug.Assert(result != 0); | |
return result; | |
} | |
} | |
else if (Vector.IsHardwareAccelerated) | |
{ | |
if ((byte*)nLength > (byte*)Vector<byte>.Count) | |
{ | |
nLength -= Vector<byte>.Count; | |
while ((byte*)nLength > (byte*)offset) | |
{ | |
if (LoadVector(ref first, offset) != LoadVector(ref second, offset)) | |
{ | |
goto NotEqual; | |
} | |
offset += Vector<byte>.Count; | |
} | |
goto NotEqual; | |
} | |
} | |
if ((byte*)nLength > (byte*)sizeof(UIntPtr)) | |
{ | |
nLength -= sizeof(UIntPtr); | |
while ((byte*)nLength > (byte*)offset) | |
{ | |
if (LoadUIntPtr(ref first, offset) != LoadUIntPtr(ref second, offset)) | |
{ | |
goto NotEqual; | |
} | |
offset += sizeof(UIntPtr); | |
} | |
} | |
NotEqual: // Workaround for https://github.com/dotnet/coreclr/issues/13549 | |
while ((byte*)minLength > (byte*)offset) | |
{ | |
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset)); | |
if (result != 0) | |
return result; | |
offset += 1; | |
} | |
Equal: | |
return firstLength - secondLength; | |
} | |
public static unsafe int SequenceCompareTo_Ben5f6dcb8(ref byte first, int firstLength, ref byte second, int secondLength) | |
{ | |
Debug.Assert(firstLength >= 0); | |
Debug.Assert(secondLength >= 0); | |
if (Unsafe.AreSame(ref first, ref second)) | |
goto Equal; | |
IntPtr minLength = (IntPtr)((firstLength < secondLength) ? firstLength : secondLength); | |
IntPtr offset = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations | |
IntPtr nLength = (IntPtr)(void*)minLength; | |
if (Avx2.IsSupported) | |
{ | |
if ((byte*)nLength >= (byte*)Vector256<byte>.Count) | |
{ | |
nLength -= Vector256<byte>.Count; | |
int matches; | |
while ((byte*)nLength > (byte*)offset) | |
{ | |
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset))); | |
if (matches == -1) | |
{ | |
// All matched | |
offset += Vector256<byte>.Count; | |
continue; | |
} | |
goto Difference; | |
} | |
// Move to Vector length from end for final compare | |
offset = nLength; | |
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset))); | |
if (matches == -1) | |
{ | |
// All matched | |
goto Equal; | |
} | |
Difference: | |
// Invert matches to find differences | |
int differences = ~matches; | |
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences)); | |
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset)); | |
Debug.Assert(result != 0); | |
return result; | |
} | |
if ((byte*)nLength >= (byte*)Vector128<byte>.Count) | |
{ | |
nLength -= Vector128<byte>.Count; | |
int matches; | |
if ((byte*)nLength > (byte*)offset) | |
{ | |
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset))); | |
if (matches == 0xFFFF) | |
{ | |
// All matched | |
offset += Vector128<byte>.Count; | |
} | |
else | |
{ | |
goto Difference; | |
} | |
} | |
// Move to Vector length from end for final compare | |
offset = nLength; | |
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset))); | |
if (matches == 0xFFFF) | |
{ | |
// All matched | |
goto Equal; | |
} | |
Difference: | |
// Invert matches to find differences | |
int differences = ~matches; | |
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences)); | |
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset)); | |
Debug.Assert(result != 0); | |
return result; | |
} | |
} | |
else if (Sse2.IsSupported) | |
{ | |
if ((byte*)nLength >= (byte*)Vector128<byte>.Count) | |
{ | |
nLength -= Vector128<byte>.Count; | |
int matches; | |
while ((byte*)nLength > (byte*)offset) | |
{ | |
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset))); | |
if (matches == 0xFFFF) | |
{ | |
// All matched | |
offset += Vector128<byte>.Count; | |
continue; | |
} | |
goto Difference; | |
} | |
// Move to Vector length from end for final compare | |
offset = nLength; | |
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset))); | |
if (matches == 0xFFFF) | |
{ | |
// All matched | |
goto Equal; | |
} | |
Difference: | |
// Invert matches to find differences | |
int differences = ~matches; | |
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences)); | |
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset)); | |
Debug.Assert(result != 0); | |
return result; | |
} | |
} | |
else if (Vector.IsHardwareAccelerated) | |
{ | |
if ((byte*)nLength > (byte*)Vector<byte>.Count) | |
{ | |
nLength -= Vector<byte>.Count; | |
while ((byte*)nLength > (byte*)offset) | |
{ | |
if (LoadVector(ref first, offset) != LoadVector(ref second, offset)) | |
{ | |
goto BytewiseCheck; | |
} | |
offset += Vector<byte>.Count; | |
} | |
goto BytewiseCheck; | |
} | |
} | |
if ((byte*)nLength > (byte*)sizeof(UIntPtr)) | |
{ | |
nLength -= sizeof(UIntPtr); | |
while ((byte*)nLength > (byte*)offset) | |
{ | |
if (LoadUIntPtr(ref first, offset) != LoadUIntPtr(ref second, offset)) | |
{ | |
goto BytewiseCheck; | |
} | |
offset += sizeof(UIntPtr); | |
} | |
} | |
BytewiseCheck: // Workaround for https://github.com/dotnet/coreclr/issues/13549 | |
while ((byte*)minLength > (byte*)offset) | |
{ | |
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset)); | |
if (result != 0) | |
return result; | |
offset += 1; | |
} | |
Equal: | |
return firstLength - secondLength; | |
} | |
public static unsafe int SequenceCompareTo_Alternative(ref byte firstOriginal, int firstLength, ref byte secondOriginal, int secondLength) | |
{ | |
Debug.Assert(firstLength >= 0); | |
Debug.Assert(secondLength >= 0); | |
if (Unsafe.AreSame(ref firstOriginal, ref secondOriginal)) | |
goto Equal; | |
long offset = ((firstLength < secondLength) ? firstLength : secondLength); | |
if (Avx2.IsSupported) | |
{ | |
ref byte first = ref Unsafe.AddByteOffset(ref firstOriginal, (IntPtr)offset); | |
ref byte second = ref Unsafe.AddByteOffset(ref secondOriginal, (IntPtr)offset); | |
// PERF: Negate once here so the remaining operations do not pay for a sign-extension conversion on every use.
offset = -offset; | |
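// first/second now reference the end of the compared region and offset runs from -minLength up toward
// zero, so a single register carries both the current position and the remaining length.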
int matches; | |
while ((int)offset <= -Vector256<byte>.Count) | |
{ | |
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, (IntPtr)offset), LoadVector256(ref second, (IntPtr)offset))); | |
if (matches == -1) | |
{ | |
// All matched | |
offset += Vector256<byte>.Count; | |
} | |
else goto Difference; | |
} | |
if ((int)offset <= -Vector128<byte>.Count) | |
{ | |
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, (IntPtr)offset), LoadVector128(ref second, (IntPtr)offset))); | |
matches ^= 0x0000FFFF; | |
if (matches != 0) | |
goto Difference; | |
offset += Vector128<byte>.Count; | |
} | |
if ((int)offset <= -sizeof(long)) | |
{ | |
ulong xor = (Unsafe.ReadUnaligned<ulong>(ref Unsafe.AddByteOffset(ref first, (IntPtr)offset)) ^ Unsafe.ReadUnaligned<ulong>(ref Unsafe.AddByteOffset(ref second, (IntPtr)offset))); | |
if (xor != 0) | |
{ | |
matches = (int)(xor >> sizeof(int)); | |
goto DifferenceExtract; | |
} | |
offset += sizeof(long); | |
} | |
if ((int)offset <= -sizeof(int)) | |
{ | |
matches = (int)(Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref first, (IntPtr)offset)) ^ Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref second, (IntPtr)offset))); | |
if (matches != 0) | |
goto DifferenceExtract; | |
offset += sizeof(int); | |
} | |
if ((int)offset <= -sizeof(short)) | |
{ | |
matches = (int)(Unsafe.ReadUnaligned<short>(ref Unsafe.AddByteOffset(ref first, (IntPtr)offset)) ^ Unsafe.ReadUnaligned<short>(ref Unsafe.AddByteOffset(ref second, (IntPtr)offset))); | |
if (matches != 0) | |
{ | |
matches <<= sizeof(int) - sizeof(short); | |
goto DifferenceExtract; | |
} | |
offset += sizeof(short); | |
} | |
if ((int)offset <= -sizeof(byte)) | |
{ | |
return Unsafe.ReadUnaligned<byte>(ref Unsafe.AddByteOffset(ref first, (IntPtr)offset)) - Unsafe.ReadUnaligned<byte>(ref Unsafe.AddByteOffset(ref second, (IntPtr)offset)); | |
} | |
return 0; | |
DifferenceExtract: | |
offset = (int)(Bmi1.ExtractLowestSetBit((uint)matches)); | |
Difference: | |
int result = Unsafe.AddByteOffset(ref first, (IntPtr)offset).CompareTo(Unsafe.AddByteOffset(ref second, (IntPtr)offset)); | |
if (result != 0) | |
return result; | |
return 0; | |
} | |
else if (Sse2.IsSupported) | |
{ | |
throw new NotImplementedException(); | |
} | |
else if (Vector.IsHardwareAccelerated) | |
{ | |
throw new NotImplementedException(); | |
} | |
Equal: | |
return firstLength - secondLength; | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe UIntPtr LoadUIntPtr(ref byte start, IntPtr offset) | |
=> Unsafe.ReadUnaligned<UIntPtr>(ref Unsafe.AddByteOffset(ref start, offset)); | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe Vector<byte> LoadVector(ref byte start, IntPtr offset) | |
=> Unsafe.ReadUnaligned<Vector<byte>>(ref Unsafe.AddByteOffset(ref start, offset)); | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe Vector128<byte> LoadVector128(ref byte start, IntPtr offset) | |
=> Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset)); | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe Vector256<byte> LoadVector256(ref byte start, IntPtr offset) | |
=> Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset)); | |
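// The helpers below appear to mirror the alignment/length utilities from the CoreFX SpanHelpers
// implementation; none of the benchmarks above call them, they are kept for reference only.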
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe IntPtr GetByteVectorSpanLength(IntPtr offset, int length) | |
=> (IntPtr)((length - (int)(byte*)offset) & ~(Vector<byte>.Count - 1)); | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe IntPtr GetByteVector128SpanLength(IntPtr offset, int length) | |
=> (IntPtr)((length - (int)(byte*)offset) & ~(Vector128<byte>.Count - 1)); | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe IntPtr GetByteVector256SpanLength(IntPtr offset, int length) | |
=> (IntPtr)((length - (int)(byte*)offset) & ~(Vector256<byte>.Count - 1)); | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe IntPtr UnalignedByteCountVector(ref byte searchSpace) | |
{ | |
int unaligned = (int)Unsafe.AsPointer(ref searchSpace) & (Vector<byte>.Count - 1); | |
return (IntPtr)((Vector<byte>.Count - unaligned) & (Vector<byte>.Count - 1)); | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe IntPtr UnalignedByteCountVector128(ref byte searchSpace) | |
{ | |
int unaligned = (int)Unsafe.AsPointer(ref searchSpace) & (Vector128<byte>.Count - 1); | |
return (IntPtr)((Vector128<byte>.Count - unaligned) & (Vector128<byte>.Count - 1)); | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private static unsafe IntPtr UnalignedByteCountVectorFromEnd(ref byte searchSpace, int length) | |
{ | |
int unaligned = (int)Unsafe.AsPointer(ref searchSpace) & (Vector<byte>.Count - 1); | |
return (IntPtr)(((length & (Vector<byte>.Count - 1)) + unaligned) & (Vector<byte>.Count - 1)); | |
} | |
} | |
} |