@redknightlois
Last active March 6, 2019 22:43
using System;
using System.Linq;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Analysers;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Columns;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Environments;
using BenchmarkDotNet.Exporters;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Validators;
using BenchmarkDotNet.Running;
using RunMode = BenchmarkDotNet.Jobs.RunMode;
using System.Diagnostics;
using System.Runtime.Intrinsics;
namespace ConsoleApp3
{
internal static class BitOps
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int TrailingZeroCount(int matches)
{
if (Bmi1.IsSupported)
{
return (int)Bmi1.TrailingZeroCount((uint)matches);
}
else // Software fallback
{
// https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup
// uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check
return Unsafe.AddByteOffset(
ref MemoryMarshal.GetReference(TrailingCountMultiplyDeBruijn),
new IntPtr(((uint)((matches & -matches) * 0x077CB531U)) >> 27));
}
}
private static ReadOnlySpan<byte> TrailingCountMultiplyDeBruijn => new byte[32]
{
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
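// Illustrative sketch (not part of the original gist): a naive reference implementation that can be
// used to sanity-check the De Bruijn fallback above. (matches & -matches) isolates the lowest set bit,
// the multiply by 0x077CB531 places a unique 5-bit pattern in the top bits, and >> 27 turns it into an
// index into TrailingCountMultiplyDeBruijn. Note the two paths differ for an input of 0 (tzcnt returns
// 32, the table returns 0); the callers in this file always pass a non-zero mask.
internal static int TrailingZeroCountNaive(int matches)
{
    int count = 0;
    while (count < 32 && (matches & (1 << count)) == 0)
        count++;
    return count;
}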
}
internal static class Bits
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int TrailingZeroesInBytes(ulong value)
{
// ulong.MaxValue >> 58 is always in range [0 - 63] so we use Unsafe.AddByteOffset to avoid bounds check
return Unsafe.AddByteOffset(
ref MemoryMarshal.GetReference(DeBruijnBytePos64),
new IntPtr((int)(((value & (ulong)(-(long)value)) * 0x0218A392CDABBD3FUL) >> 58)));
}
private static ReadOnlySpan<byte> DeBruijnBytePos64 => new byte[64]
{
0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7,
7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
};
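// Illustrative sketch (not part of the original gist): a straightforward reference for the
// byte-granularity lookup above. TrailingZeroesInBytes(x) answers "in which byte does the lowest
// set bit of x live?", which is how CompareScalarAltDeBrujain locates the first differing byte of
// an 8-byte block on little-endian hardware.
public static int TrailingZeroesInBytesNaive(ulong value)
{
    for (int byteIndex = 0; byteIndex < 8; byteIndex++)
    {
        if ((byte)(value >> (byteIndex * 8)) != 0)
            return byteIndex;
    }
    return 0; // matches the table's result for value == 0
}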
}
class Program
{
static void Main(string[] args)
{
Console.WriteLine($"{nameof(Sse)} support: {Sse.IsSupported}");
Console.WriteLine($"{nameof(Sse2)} support: {Sse2.IsSupported}");
Console.WriteLine($"{nameof(Sse3)} support: {Sse3.IsSupported}");
Console.WriteLine($"{nameof(Sse41)} support: {Sse41.IsSupported}");
Console.WriteLine($"{nameof(Avx)} support: {Avx.IsSupported}");
Console.WriteLine($"{nameof(Avx2)} support: {Avx2.IsSupported}");
//var p = new Compare();
//p.KeySize = 4096 * 4096 - 1;
//p.Setup();
//for (int i = 0; i < 100; i++)
// p.WithCacheMisses_Framework_SequenceCompareTo_Alternative();
//for (int i = 0; i < 100; i++)
// p.WithCacheMisses_Framework_SequenceCompareTo_Internal_Ben5f6dcb8();
BenchmarkRunner.Run<Compare>();
}
}
//[HardwareCounters(HardwareCounter.CacheMisses, HardwareCounter.TotalIssues, HardwareCounter.BranchMispredictions, HardwareCounter.InstructionRetired )]
//[DisassemblyDiagnoser]
[Config(typeof(Compare.Config))]
public unsafe class Compare
{
private class Config : ManualConfig
{
public Config()
{
Add(new Job(RunMode.Default)
{
Environment =
{
Runtime = Runtime.Core,
Platform = Platform.X64,
Jit = Jit.RyuJit
}
});
// Exporters for data
Add(GetExporters().ToArray());
// Generate plots using R if %R_HOME% is correctly set
Add(RPlotExporter.Default);
Add(StatisticColumn.AllStatistics);
Add(BaselineValidator.FailOnError);
Add(JitOptimizationsValidator.FailOnError);
Add(EnvironmentAnalyser.Default);
}
}
//[Params(7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256, 1024, 2048, 4096, 4096 * 16, 4096 * 4096)]
//[Params(15, 31, 39, 48, 50, 63, 90, 117, 127, 190, 255, 256)]
//[Params(7, 8, 15, 16, 24, 31, 32, 64, 127, 128, 256, 1024, 2048, 4096)]
//[Params(7, 15, 16, 27, 32, 33, 4096, 4096 * 16, 4096 * 4096)]
[Params(63, 4095, 4096 * 16 - 1)]
public int KeySize { get; set; }
public const int Operations = 100;
private int size = 1024 * 1024 * 100;
private byte* source;
private byte* destination;
private int[] randomLocation;
private int[] sameLocation;
public const int VectorBytes = 32;
[GlobalSetup]
public void Setup()
{
if (Vector<byte>.Count != VectorBytes)
throw new NotSupportedException($"This benchmark assumes Vector<byte>.Count == {VectorBytes}.");
source = (byte*)Marshal.AllocHGlobal(size).ToPointer();
destination = (byte*)Marshal.AllocHGlobal(size).ToPointer();
var r = new Random();
for (int i = 0; i < size; i++)
{
// Make the last byte of every KeySize-sized key differ so comparisons must scan the whole key.
if (i % KeySize == KeySize - 1)
{
source[i] = 0;
destination[i] = 1;
}
else
{
int b = r.Next();
source[i] = (byte)b;
destination[i] = (byte)b;
}
}
randomLocation = new int[Operations];
sameLocation = new int[Operations];
int range = (size / KeySize) - 1;
for (int i = 0; i < randomLocation.Length; i++)
{
randomLocation[i] = r.Next(range) * KeySize;
sameLocation[i] = 0;
}
}
//[Benchmark(OperationsPerInvoke = Operations)]
public int WithCacheMisses_Scalar()
{
int r = 0;
foreach (int index in randomLocation)
r += CompareOriginal(source + index, destination + index, KeySize);
return r;
}
//[Benchmark(OperationsPerInvoke = Operations)]
public int WithCacheMisses_Framework_MemoryExtensions_Reference()
{
int r = 0;
foreach (int index in randomLocation)
r += MemoryExtensions.SequenceCompareTo(new ReadOnlySpan<byte>(source + index, KeySize), new ReadOnlySpan<byte>(destination + index, KeySize));
return r;
}
[Benchmark(OperationsPerInvoke = Operations)]
public int WithCacheMisses_Framework_SequenceCompareTo_Internal_Ben5f6dcb8()
{
int r = 0;
foreach (int index in randomLocation)
r += SequenceCompareTo_Ben5f6dcb8(ref *(source + index), KeySize, ref *(destination + index), KeySize);
return r;
}
[Benchmark(OperationsPerInvoke = Operations)]
public int WithCacheMisses_Framework_SequenceCompareTo_Alternative()
{
int r = 0;
foreach (int index in randomLocation)
r += SequenceCompareTo_Alternative(ref *(source + index), KeySize, ref *(destination + index), KeySize);
return r;
}
//[Benchmark(OperationsPerInvoke = Operations)]
public int WithCacheMisses_Numerics32()
{
int r = 0;
foreach (int index in randomLocation)
r += CompareNumerics(source + index, destination + index, KeySize);
return r;
}
//[Benchmark(OperationsPerInvoke = Operations)]
//public int WithCacheMisses_ScalarXor()
//{
// int r = 0;
// foreach (int index in randomLocation)
// r += CompareScalarAltDeBrujain(source + index, destination + index, KeySize);
// return r;
//}
//[Benchmark(OperationsPerInvoke = Operations)]
//public int WithCacheMisses_ScalarCmpXorPopCount()
//{
// int r = 0;
// foreach (int index in randomLocation)
// r += CompareScalarCmpAltPopCount(source + index, destination + index, KeySize);
// return r;
//}
//[Benchmark(OperationsPerInvoke = Operations)]
//public int WithCacheMisses_ScalarCmpXorPopCount_Prefetch()
//{
// int r = 0;
// foreach (int index in randomLocation)
// r += CompareScalarCmpAltPopCount_Prefetch(source + index, destination + index, KeySize);
// return r;
//}
//[Benchmark(OperationsPerInvoke = Operations)]
//public int WithCacheMisses_ScalarAlt2()
//{
// int r = 0;
// foreach (int index in randomLocation)
// r += CompareScalarAlt2(source + index, destination + index, KeySize);
// return r;
//}
//[Benchmark(OperationsPerInvoke = Operations)]
//public int WithCacheMisses_ScalarAlt3()
//{
// int r = 0;
// foreach (int index in randomLocation)
// r += CompareScalarAlt3(source + index, destination + index, KeySize);
// return r;
//}
//[Benchmark(OperationsPerInvoke = Operations)]
//public int WithCacheMisses_NumericsAlt32()
//{
// int r = 0;
// foreach (int index in randomLocation)
// r += CompareNumericsAlt(source + index, destination + index, KeySize);
// return r;
//}
//[Benchmark]
public int NoCacheMisses_Scalar()
{
return CompareOriginal(source, destination, KeySize);
}
//[Benchmark]
public int NoCacheMisses_Framework_MemoryExtensions_Reference()
{
return MemoryExtensions.SequenceCompareTo(new Span<byte>(source, KeySize), new Span<byte>(destination, KeySize));
}
[Benchmark]
public int NoCacheMisses_Framework_SequenceCompareTo_Internal_Ben5f6dcb8()
{
return SequenceCompareTo_Ben5f6dcb8(ref *(source), KeySize, ref *(destination), KeySize);
}
[Benchmark]
public int NoCacheMisses_Framework_SequenceCompareTo_Alternative()
{
return SequenceCompareTo_Alternative(ref *(source), KeySize, ref *(destination), KeySize);
}
//[Benchmark]
//public int NoCacheMisses_ScalarXor()
//{
// return CompareScalarAltDeBrujain(source, destination, KeySize);
//}
//[Benchmark]
//public int NoCacheMisses_ScalarXorPopCount()
//{
// return CompareScalarAltPopCount(source, destination, KeySize);
//}
//[Benchmark]
//public int NoCacheMisses_ScalarCmpXorPopCount()
//{
// return CompareScalarCmpAltPopCount(source, destination, KeySize);
//}
//[Benchmark]
//public int NoCacheMisses_ScalarCmpXorPopCount_Prefetch()
//{
// return CompareScalarCmpAltPopCount_Prefetch(source, destination, KeySize);
//}
//[Benchmark]
public int NoCacheMisses_Numerics32()
{
return CompareNumerics(source, destination, KeySize);
}
//[Benchmark]
//public int NoCacheMisses_NumericsAlt32()
//{
// return CompareNumericsAlt(source, destination, KeySize);
//}
private static int CompareScalarAltPopCount(void* p1, void* p2, int size)
{
byte* bpx = (byte*)p1;
byte* bpy = (byte*)p2;
// PERF: Computing the delta between the two buffers lets us advance a single pointer and address
// the second buffer as bpx + offset, so the JIT can use base+index addressing without needing an extra register.
long offset = bpy - bpx;
if (size < 8)
goto ProcessSmall;
// PERF: The current version of the JIT (2.0.5) will emit a 4-instruction magic division
// instead of a simple shift, even though the divisor is a power of 2.
int l = size >> 3; // (Equivalent to size / 8)
ulong xor;
for (int i = 0; i < l; i++, bpx += 8)
{
// PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]```
xor = *((ulong*)bpx) ^ *(ulong*)(bpx + offset);
if (xor != 0)
goto Tail;
}
ProcessSmall:
if ((size & 4) != 0)
{
xor = *((uint*)bpx) ^ *((uint*)(bpx + offset));
if (xor != 0)
goto Tail;
bpx += 4;
}
if ((size & 2) != 0)
{
xor = (ulong)(*((ushort*)bpx) ^ *((ushort*)(bpx + offset)));
if (xor != 0)
goto Tail;
bpx += 2;
}
if ((size & 1) != 0)
{
return *bpx - *(bpx + offset);
}
return 0;
Tail:
// PERF: This is a bit-twiddling hack. XOR-ing the two values sets a bit wherever they differ,
// and because we are running on little-endian hardware the lowest set bit belongs to the
// first byte that is different.
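// Worked example (illustrative values): xor = 0x0000_0000_00FF_0000 means byte 2 is the first difference.
// xor & -xor isolates the lowest set bit (0x0000_0000_0001_0000); subtracting 1 leaves 16 one-bits,
// PopCount gives 16, and 16 >> 3 == 2, the byte offset added to bpx below.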
bpx += Popcnt.X64.PopCount((ulong)((long)xor & -(long)xor) - 1) >> 3;
return *bpx - *(bpx + offset);
}
private static int CompareScalarCmpAltPopCount_Prefetch(void* p1, void* p2, int size)
{
Sse.Prefetch2(p1);
Sse.Prefetch2(p2);
byte* bpx = (byte*)p1;
// PERF: Computing the delta between the two buffers lets us advance a single pointer and address
// the second buffer as bpx + offset, so the JIT can use base+index addressing without needing an extra register.
long offset = (byte*)p2 - bpx;
if ((size & 7) == 0)
goto ProcessAligned;
// Process the "unaligned" remainder of the size first.
ulong xor;
if ((size & 4) != 0)
{
xor = *((uint*)bpx) ^ *((uint*)(bpx + offset));
if (xor != 0)
goto Tail;
bpx += 4;
}
if ((size & 2) != 0)
{
xor = (ulong)(*((ushort*)bpx) ^ *((ushort*)(bpx + offset)));
if (xor != 0)
goto Tail;
bpx += 2;
}
if ((size & 1) != 0)
{
int value = *bpx - *(bpx + offset);
if (value != 0)
return value;
bpx += 1;
}
ProcessAligned:
byte* end = (byte*)p1 + size;
byte* loopEnd = end - 16;
while (bpx <= loopEnd)
{
// PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]```
if (*((ulong*)bpx) != *(ulong*)(bpx + offset))
goto XorTail;
if (*((ulong*)(bpx + 8)) != *(ulong*)(bpx + 8 + offset))
{
bpx += 8;
goto XorTail;
}
bpx += 16;
}
if (bpx < end)
goto XorTail;
return 0;
XorTail:
xor = *((ulong*)bpx) ^ *(ulong*)(bpx + offset);
Tail:
// Fast-path for equals
if (xor == 0)
return 0;
// PERF: This is a bit-twiddling hack. XOR-ing the two values sets a bit wherever they differ,
// and because we are running on little-endian hardware the lowest set bit belongs to the
// first byte that is different.
bpx += Popcnt.X64.PopCount((ulong)((long)xor & -(long)xor) - 1) >> 3;
return *bpx - *(bpx + offset);
}
private static int CompareScalarCmpAltPopCount(void* p1, void* p2, int size)
{
byte* bpx = (byte*)p1;
// PERF: Computing the delta between the two buffers lets us advance a single pointer and address
// the second buffer as bpx + offset, so the JIT can use base+index addressing without needing an extra register.
long offset = (byte*)p2 - bpx;
if ((size & 7) == 0)
goto ProcessAligned;
// Process the "unaligned" remainder of the size first.
ulong xor;
if ((size & 4) != 0)
{
xor = *((uint*)bpx) ^ *((uint*)(bpx + offset));
if (xor != 0)
goto Tail;
bpx += 4;
}
if ((size & 2) != 0)
{
xor = (ulong)(*((ushort*)bpx) ^ *((ushort*)(bpx + offset)));
if (xor != 0)
goto Tail;
bpx += 2;
}
if ((size & 1) != 0)
{
int value = *bpx - *(bpx + offset);
if (value != 0)
return value;
bpx += 1;
}
ProcessAligned:
byte* end = (byte*)p1 + size;
byte* loopEnd = end - 16;
while (bpx <= loopEnd)
{
// PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]```
if (*((ulong*)bpx) != *(ulong*)(bpx + offset))
goto XorTail;
if (*((ulong*)(bpx + 8)) != *(ulong*)(bpx + 8 + offset))
{
bpx += 8;
goto XorTail;
}
bpx += 16;
}
if (bpx < end)
goto XorTail;
return 0;
XorTail: xor = *((ulong*)bpx) ^ *(ulong*)(bpx + offset);
Tail:
// Fast-path for equals
if (xor == 0)
return 0;
// PERF: This is a bit-twiddling hack. XOR-ing the two values sets a bit wherever they differ,
// and because we are running on little-endian hardware the lowest set bit belongs to the
// first byte that is different.
bpx += Popcnt.X64.PopCount((ulong)((long)xor & -(long)xor) - 1) >> 3;
return *bpx - *(bpx + offset);
}
private static int CompareScalarAltDeBrujain(void* p1, void* p2, int size)
{
byte* bpx = (byte*)p1;
// PERF: Computing the delta between the two buffers lets us advance a single pointer and address
// the second buffer as bpx + offset, so the JIT can use base+index addressing without needing an extra register.
long offset = (byte*)p2 - bpx;
if (size < 8)
goto ProcessSmall;
// PERF: The current version of the JIT (2.0.5) will emit a 4-instruction magic division
// instead of a simple shift, even though the divisor is a power of 2.
int l = size >> 3; // (Equivalent to size / 8)
ulong xor;
for (int i = 0; i < l; i++, bpx += 8)
{
// PERF: JIT will emit: ```{op} {reg}, qword ptr [rdx+rax]```
xor = *((ulong*)bpx) ^ *(ulong*)(bpx + offset);
if (xor != 0)
goto Tail;
}
ProcessSmall:
if ((size & 4) != 0)
{
xor = *((uint*)bpx) ^ *((uint*)(bpx + offset));
if (xor != 0)
goto Tail;
bpx += 4;
}
if ((size & 2) != 0)
{
xor = (ulong)(*((ushort*)bpx) ^ *((ushort*)(bpx + offset)));
if (xor != 0)
goto Tail;
bpx += 2;
}
if ((size & 1) != 0)
{
return *bpx - *(bpx + offset);
}
return 0;
Tail:
// PERF: This is a bit-twiddling hack. XOR-ing the two values sets a bit wherever they differ,
// and because we are running on little-endian hardware the lowest set bit belongs to the
// first byte that is different.
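// Worked example (illustrative values): xor = 0x0000_00FF_0000_0000 gives TrailingZeroesInBytes(xor) == 4,
// so bpx is advanced to the first differing byte before the final subtraction.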
bpx += Bits.TrailingZeroesInBytes(xor);
return *bpx - *(bpx + offset);
}
private static int CompareScalarAlt2(void* p1, void* p2, int size)
{
// If we use an unmanaged bulk version with an inline compare, the call site does not get optimized properly.
// If you know you will be comparing big memory chunks, do not use the inline version.
byte* bpx = (byte*)p1;
byte* bpy = (byte*)p2;
long offset = bpy - bpx;
if (size < 8)
goto ProcessSmall;
int l = size >> 3; // (Equivalent to size / 8)
for (int i = 0; i < l; i++, bpx += 8)
{
if (*((long*)bpx) != *((long*)(bpx + offset)))
{
// Narrow size to 4: the difference lies somewhere within this 8-byte word.
size = 4;
goto ProcessWord;
}
}
goto ProcessSmall;
ProcessWord:
// We know that we have 8 valid bytes to process and they are different somewhere.
// We then check if half of it is different.
if (*((int*)bpx) == *((int*)(bpx + offset)))
bpx += 4;
ProcessSmall:
if ((size & 4) != 0)
{
if (*((int*)bpx) == *((int*)(bpx + offset)))
{
bpx += 4;
}
}
if ((size & 2) != 0)
{
if (*((short*)bpx) == *((short*)(bpx + offset)))
{
bpx += 2;
}
}
if ((size & 1) != 0)
{
return *bpx - *(bpx + offset);
}
return 0;
}
private static int CompareScalarAlt3(void* p1, void* p2, int size)
{
// If we use an unmanaged bulk version with an inline compare, the call site does not get optimized properly.
// If you know you will be comparing big memory chunks, do not use the inline version.
byte* bpx = (byte*)p1;
byte* bpy = (byte*)p2;
long offset = bpy - bpx;
if (size < 8)
goto ProcessSmall;
int l = size >> 3; // (Equivalent to size / 8)
for (int i = 0; i < l; i++, bpx += 8)
{
if (*((long*)bpx) != *((long*)(bpx + offset)))
break;
}
// Narrow size to 4: the difference lies somewhere within this 8-byte word.
size = 4;
if (*((int*)bpx) == *((int*)(bpx + offset)))
bpx += 4;
ProcessSmall:
if ((size & 4) != 0)
{
if (*((int*)bpx) == *((int*)(bpx + offset)))
{
bpx += 4;
}
}
if (((byte)size & 2) != 0)
{
if (*((short*)bpx) == *((short*)(bpx + offset)))
{
bpx += 2;
}
}
if (((byte)size & 1) != 0)
{
return *bpx - *(bpx + offset);
}
return 0;
}
private static int CompareOriginal(void* p1, void* p2, int size)
{
// If we use an unmanaged bulk version with an inline compare, the call site does not get optimized properly.
// If you know you will be comparing big memory chunks, do not use the inline version.
int l = size;
byte* bpx = (byte*)p1, bpy = (byte*)p2;
int last;
for (int i = 0; i < l / 8; i++, bpx += 8, bpy += 8)
{
if (*((long*)bpx) != *((long*)bpy))
{
last = 8;
goto Tail;
}
}
if ((l & 4) != 0)
{
if (*((int*)bpx) != *((int*)bpy))
{
last = 4;
goto Tail;
}
bpx += 4;
bpy += 4;
}
if ((l & 2) != 0)
{
if (*((short*)bpx) != *((short*)bpy))
{
last = 2;
goto Tail;
}
bpx += 2;
bpy += 2;
}
if ((l & 1) != 0)
{
return (*((byte*)bpx) - *((byte*)bpy));
}
return 0;
Tail:
while (last > 0)
{
if (*((byte*)bpx) != *((byte*)bpy))
return *bpx - *bpy;
bpx++;
bpy++;
last--;
}
return 0;
}
private static int CompareNumericsAlt(void* p1, void* p2, int size)
{
byte* bpx = (byte*)p1, bpy = (byte*)p2;
byte* end = bpx + size;
byte* currentEnd = end - (size & (32 - 1));
while (bpx < currentEnd)
{
var vx = Unsafe.Read<Vector<byte>>(bpx);
var vy = Unsafe.Read<Vector<byte>>(bpy);
var xor = Vector.Xor(vx, vy);
if (xor != Vector<byte>.Zero)
break;
bpx += 32;
bpy += 32;
}
currentEnd = end - (size & (sizeof(long) - 1));
while (bpx < currentEnd)
{
ulong vx = ((ulong*)bpx)[0];
ulong vy = ((ulong*)bpy)[0];
if (vx != vy)
break;
bpx += 8;
bpy += 8;
}
while (bpx < end)
{
int r = *bpx - *bpy;
if (r != 0)
return r;
bpx += 1;
bpy += 1;
}
return 0;
}
private static int CompareNumerics(void* p1, void* p2, int size)
{
byte* bpx = (byte*)p1, bpy = (byte*)p2;
// If we use an unmanaged bulk version with an inline compare, the call site does not get optimized properly.
// If you know you will be comparing big memory chunks, do not use the inline version.
int l = size / VectorBytes; // This should translate into a shift operation.
size -= l * VectorBytes; // Bytes left over after the vectorized portion.
while (l > 0)
{
var vx = Unsafe.Read<Vector<byte>>(bpx);
var vy = Unsafe.Read<Vector<byte>>(bpy);
var xor = Vector.Xor(vx, vy);
if (xor != Vector<byte>.Zero)
break;
l--;
bpx += VectorBytes;
bpy += VectorBytes;
}
if (size <= 8)
goto Last;
if (size > 8 && ((long*)bpx)[0] != ((long*)bpy)[0])
goto Last;
if (size > 16 && ((long*)bpx)[1] != ((long*)bpy)[1])
goto Last;
if (size > 24 && ((long*)bpx)[2] != ((long*)bpy)[2])
goto Last;
if (size == 32 && ((long*)bpx)[3] != ((long*)bpy)[3])
goto Last;
return 0;
Last:
size %= 8; // This should translate to an AND operation.
int last = 0;
while (size > 0)
{
int r = bpx[last] - bpy[last];
if (r != 0)
return r;
size--;
last++;
}
return 0;
}
public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref byte second, int secondLength)
{
Debug.Assert(firstLength >= 0);
Debug.Assert(secondLength >= 0);
if (Unsafe.AreSame(ref first, ref second))
goto Equal;
IntPtr minLength = (IntPtr)((firstLength < secondLength) ? firstLength : secondLength);
IntPtr offset = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations
IntPtr nLength = (IntPtr)(void*)minLength;
if (Avx2.IsSupported)
{
if ((byte*)nLength >= (byte*)Vector256<byte>.Count)
{
nLength -= Vector256<byte>.Count;
int matches;
while ((byte*)nLength > (byte*)offset)
{
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset)));
if (matches == -1)
{
// All matched
offset += Vector256<byte>.Count;
continue;
}
goto Difference;
}
// Move to Vector length from end for final compare
offset = nLength;
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset)));
if (matches == -1)
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
int differences = ~matches;
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences));
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);
return result;
}
if ((byte*)nLength >= (byte*)Vector128<byte>.Count)
{
nLength -= Vector128<byte>.Count;
int matches;
if ((byte*)nLength > (byte*)offset)
{
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
if (matches == 0xFFFF)
{
// All matched
offset += Vector128<byte>.Count;
}
else
{
goto Difference;
}
}
// Move to Vector length from end for final compare
offset = nLength;
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
if (matches == 0xFFFF)
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
int differences = ~matches;
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences));
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);
return result;
}
}
else if (Sse2.IsSupported)
{
if ((byte*)nLength >= (byte*)Vector128<byte>.Count)
{
nLength -= Vector128<byte>.Count;
int matches;
while ((byte*)nLength > (byte*)offset)
{
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
if (matches == 0xFFFF)
{
// All matched
offset += Vector128<byte>.Count;
continue;
}
goto Difference;
}
// Move to Vector length from end for final compare
offset = nLength;
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
if (matches == 0xFFFF)
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
int differences = ~matches;
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences));
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);
return result;
}
}
else if (Vector.IsHardwareAccelerated)
{
if ((byte*)nLength > (byte*)Vector<byte>.Count)
{
nLength -= Vector<byte>.Count;
while ((byte*)nLength > (byte*)offset)
{
if (LoadVector(ref first, offset) != LoadVector(ref second, offset))
{
goto NotEqual;
}
offset += Vector<byte>.Count;
}
goto NotEqual;
}
}
if ((byte*)nLength > (byte*)sizeof(UIntPtr))
{
nLength -= sizeof(UIntPtr);
while ((byte*)nLength > (byte*)offset)
{
if (LoadUIntPtr(ref first, offset) != LoadUIntPtr(ref second, offset))
{
goto NotEqual;
}
offset += sizeof(UIntPtr);
}
}
NotEqual: // Workaround for https://github.com/dotnet/coreclr/issues/13549
while ((byte*)minLength > (byte*)offset)
{
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
if (result != 0)
return result;
offset += 1;
}
Equal:
return firstLength - secondLength;
}
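// Illustrative sketch (not part of the original benchmark): the core step shared by the vectorized
// SequenceCompareTo variants above, written out on its own. CompareEqual yields 0xFF in every lane
// where the bytes match, MoveMask packs those lanes into the low 32 bits of an int, and the trailing
// zero count of the inverted mask is the index of the first byte that differs within the 32-byte block.
// Callers must check Avx2.IsSupported before using it.
private static int FirstDifferenceInBlock_Avx2(ref byte first, ref byte second, IntPtr offset)
{
    int matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset)));
    if (matches == -1)
        return -1; // the two 32-byte blocks are identical
    return BitOps.TrailingZeroCount(~matches);
}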
public static unsafe int SequenceCompareTo_Ben5f6dcb8(ref byte first, int firstLength, ref byte second, int secondLength)
{
Debug.Assert(firstLength >= 0);
Debug.Assert(secondLength >= 0);
if (Unsafe.AreSame(ref first, ref second))
goto Equal;
IntPtr minLength = (IntPtr)((firstLength < secondLength) ? firstLength : secondLength);
IntPtr offset = (IntPtr)0; // Use IntPtr for arithmetic to avoid unnecessary 64->32->64 truncations
IntPtr nLength = (IntPtr)(void*)minLength;
if (Avx2.IsSupported)
{
if ((byte*)nLength >= (byte*)Vector256<byte>.Count)
{
nLength -= Vector256<byte>.Count;
int matches;
while ((byte*)nLength > (byte*)offset)
{
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset)));
if (matches == -1)
{
// All matched
offset += Vector256<byte>.Count;
continue;
}
goto Difference;
}
// Move to Vector length from end for final compare
offset = nLength;
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, offset), LoadVector256(ref second, offset)));
if (matches == -1)
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
int differences = ~matches;
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences));
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);
return result;
}
if ((byte*)nLength >= (byte*)Vector128<byte>.Count)
{
nLength -= Vector128<byte>.Count;
int matches;
if ((byte*)nLength > (byte*)offset)
{
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
if (matches == 0xFFFF)
{
// All matched
offset += Vector128<byte>.Count;
}
else
{
goto Difference;
}
}
// Move to Vector length from end for final compare
offset = nLength;
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
if (matches == 0xFFFF)
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
int differences = ~matches;
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences));
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);
return result;
}
}
else if (Sse2.IsSupported)
{
if ((byte*)nLength >= (byte*)Vector128<byte>.Count)
{
nLength -= Vector128<byte>.Count;
int matches;
while ((byte*)nLength > (byte*)offset)
{
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
if (matches == 0xFFFF)
{
// All matched
offset += Vector128<byte>.Count;
continue;
}
goto Difference;
}
// Move to Vector length from end for final compare
offset = nLength;
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, offset), LoadVector128(ref second, offset)));
if (matches == 0xFFFF)
{
// All matched
goto Equal;
}
Difference:
// Invert matches to find differences
int differences = ~matches;
offset = (IntPtr)((int)(byte*)offset + BitOps.TrailingZeroCount(differences));
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
Debug.Assert(result != 0);
return result;
}
}
else if (Vector.IsHardwareAccelerated)
{
if ((byte*)nLength > (byte*)Vector<byte>.Count)
{
nLength -= Vector<byte>.Count;
while ((byte*)nLength > (byte*)offset)
{
if (LoadVector(ref first, offset) != LoadVector(ref second, offset))
{
goto BytewiseCheck;
}
offset += Vector<byte>.Count;
}
goto BytewiseCheck;
}
}
if ((byte*)nLength > (byte*)sizeof(UIntPtr))
{
nLength -= sizeof(UIntPtr);
while ((byte*)nLength > (byte*)offset)
{
if (LoadUIntPtr(ref first, offset) != LoadUIntPtr(ref second, offset))
{
goto BytewiseCheck;
}
offset += sizeof(UIntPtr);
}
}
BytewiseCheck: // Workaround for https://github.com/dotnet/coreclr/issues/13549
while ((byte*)minLength > (byte*)offset)
{
int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
if (result != 0)
return result;
offset += 1;
}
Equal:
return firstLength - secondLength;
}
public static unsafe int SequenceCompareTo_Alternative(ref byte firstOriginal, int firstLength, ref byte secondOriginal, int secondLength)
{
Debug.Assert(firstLength >= 0);
Debug.Assert(secondLength >= 0);
if (Unsafe.AreSame(ref firstOriginal, ref secondOriginal))
goto Equal;
long offset = ((firstLength < secondLength) ? firstLength : secondLength);
if (Avx2.IsSupported)
{
ref byte first = ref Unsafe.AddByteOffset(ref firstOriginal, (IntPtr)offset);
ref byte second = ref Unsafe.AddByteOffset(ref secondOriginal, (IntPtr)offset);
// PERF: first/second now point one past the last byte to compare, and offset is made negative so the
// checks below count up toward zero; doing the sign extension once here avoids repeated
// sign-extension conversions on every later use of the offset.
offset = -offset;
int matches;
while ((int)offset <= -Vector256<byte>.Count)
{
matches = Avx2.MoveMask(Avx2.CompareEqual(LoadVector256(ref first, (IntPtr)offset), LoadVector256(ref second, (IntPtr)offset)));
if (matches == -1)
{
// All matched
offset += Vector256<byte>.Count;
}
else goto Difference;
}
if ((int)offset <= -Vector128<byte>.Count)
{
matches = Sse2.MoveMask(Sse2.CompareEqual(LoadVector128(ref first, (IntPtr)offset), LoadVector128(ref second, (IntPtr)offset)));
matches ^= 0x0000FFFF;
if (matches != 0)
goto Difference;
offset += Vector128<byte>.Count;
}
if ((int)offset <= -sizeof(long))
{
ulong xor = (Unsafe.ReadUnaligned<ulong>(ref Unsafe.AddByteOffset(ref first, (IntPtr)offset)) ^ Unsafe.ReadUnaligned<ulong>(ref Unsafe.AddByteOffset(ref second, (IntPtr)offset)));
if (xor != 0)
{
matches = (int)(xor >> sizeof(int));
goto DifferenceExtract;
}
offset += sizeof(long);
}
if ((int)offset <= -sizeof(int))
{
matches = (int)(Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref first, (IntPtr)offset)) ^ Unsafe.ReadUnaligned<uint>(ref Unsafe.AddByteOffset(ref second, (IntPtr)offset)));
if (matches != 0)
goto DifferenceExtract;
offset += sizeof(int);
}
if ((int)offset <= -sizeof(short))
{
matches = (int)(Unsafe.ReadUnaligned<short>(ref Unsafe.AddByteOffset(ref first, (IntPtr)offset)) ^ Unsafe.ReadUnaligned<short>(ref Unsafe.AddByteOffset(ref second, (IntPtr)offset)));
if (matches != 0)
{
matches <<= sizeof(int) - sizeof(short);
goto DifferenceExtract;
}
offset += sizeof(short);
}
if ((int)offset <= -sizeof(byte))
{
return Unsafe.ReadUnaligned<byte>(ref Unsafe.AddByteOffset(ref first, (IntPtr)offset)) - Unsafe.ReadUnaligned<byte>(ref Unsafe.AddByteOffset(ref second, (IntPtr)offset));
}
return 0;
DifferenceExtract:
offset = (int)(Bmi1.ExtractLowestSetBit((uint)matches));
Difference:
int result = Unsafe.AddByteOffset(ref first, (IntPtr)offset).CompareTo(Unsafe.AddByteOffset(ref second, (IntPtr)offset));
if (result != 0)
return result;
return 0;
}
else if (Sse2.IsSupported)
{
throw new NotImplementedException();
}
else if (Vector.IsHardwareAccelerated)
{
throw new NotImplementedException();
}
Equal:
return firstLength - secondLength;
}
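// Illustrative usage sketch (assumed, not from the original gist): invoking the ref-byte comparers
// over managed spans instead of raw pointers, mirroring how MemoryExtensions.SequenceCompareTo is
// called in the benchmarks above.
private static int CompareSpans(ReadOnlySpan<byte> x, ReadOnlySpan<byte> y)
{
    return SequenceCompareTo_Ben5f6dcb8(
        ref MemoryMarshal.GetReference(x), x.Length,
        ref MemoryMarshal.GetReference(y), y.Length);
}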
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe UIntPtr LoadUIntPtr(ref byte start, IntPtr offset)
=> Unsafe.ReadUnaligned<UIntPtr>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe Vector<byte> LoadVector(ref byte start, IntPtr offset)
=> Unsafe.ReadUnaligned<Vector<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe Vector128<byte> LoadVector128(ref byte start, IntPtr offset)
=> Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe Vector256<byte> LoadVector256(ref byte start, IntPtr offset)
=> Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe IntPtr GetByteVectorSpanLength(IntPtr offset, int length)
=> (IntPtr)((length - (int)(byte*)offset) & ~(Vector<byte>.Count - 1));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe IntPtr GetByteVector128SpanLength(IntPtr offset, int length)
=> (IntPtr)((length - (int)(byte*)offset) & ~(Vector128<byte>.Count - 1));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe IntPtr GetByteVector256SpanLength(IntPtr offset, int length)
=> (IntPtr)((length - (int)(byte*)offset) & ~(Vector256<byte>.Count - 1));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe IntPtr UnalignedByteCountVector(ref byte searchSpace)
{
int unaligned = (int)Unsafe.AsPointer(ref searchSpace) & (Vector<byte>.Count - 1);
return (IntPtr)((Vector<byte>.Count - unaligned) & (Vector<byte>.Count - 1));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe IntPtr UnalignedByteCountVector128(ref byte searchSpace)
{
int unaligned = (int)Unsafe.AsPointer(ref searchSpace) & (Vector128<byte>.Count - 1);
return (IntPtr)((Vector128<byte>.Count - unaligned) & (Vector128<byte>.Count - 1));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe IntPtr UnalignedByteCountVectorFromEnd(ref byte searchSpace, int length)
{
int unaligned = (int)Unsafe.AsPointer(ref searchSpace) & (Vector<byte>.Count - 1);
return (IntPtr)(((length & (Vector<byte>.Count - 1)) + unaligned) & (Vector<byte>.Count - 1));
}
}
}