using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;

// Self-check, then benchmark.
//
// For 1024 pseudo-random inputs (fixed seed, so runs are repeatable) the result of
// Impl.ShuffleUnsafe on the RAW indices is compared with the framework Shuffle on
// MASKED indices (0x8F for 128-bit, 0x9F for 256-bit). The comparison therefore only
// succeeds when ShuffleUnsafe disregards the index bits that the mask clears.
// Mismatches are reported on the console; the benchmarks run afterwards either way.

// Overwrites every byte of `destination` with the next bytes drawn from `source`.
static void FillRandom<T>(Random source, ref T destination) where T : struct
	=> source.NextBytes(MemoryMarshal.AsBytes(new Span<T>(ref destination)));

Random rng = new(71);
Vector128<byte> indexMask128 = Vector128.Create((byte)0x8F);
Vector256<byte> indexMask256 = Vector256.Create((byte)0x9F);

for (int iteration = 0; iteration < 1024; iteration++)
{
	Vector128<byte> values128 = default;
	Vector128<byte> indices128 = default;
	Vector256<byte> values256 = default;
	Vector256<byte> indices256 = default;

	// The draw order (values128, indices128, values256, indices256) determines
	// which bytes of the seeded sequence each vector receives.
	FillRandom(rng, ref values128);
	FillRandom(rng, ref indices128);
	FillRandom(rng, ref values256);
	FillRandom(rng, ref indices256);

	// Implementation under test sees the unmasked indices.
	Vector128<byte> actual128 = Impl.ShuffleUnsafe(values128, indices128);
	Vector256<byte> actual256 = Impl.ShuffleUnsafe(values256, indices256);

	// Reference results are computed from the masked indices; the masked indices
	// are also what gets printed on a mismatch.
	Vector128<byte> masked128 = indices128 & indexMask128;
	Vector256<byte> masked256 = indices256 & indexMask256;
	Vector128<byte> expected128 = Vector128.Shuffle(values128, masked128);
	Vector256<byte> expected256 = Vector256.Shuffle(values256, masked256);

	if (actual128 != expected128)
	{
		Console.WriteLine($"{iteration}: Error 128: {values128} | {masked128} => {actual128}, expected: {expected128}");
	}

	if (actual256 != expected256)
	{
		Console.WriteLine($"{iteration}: Error 256: {values256} | {masked256} => {actual256}, expected: {expected256}");
	}
}

BenchmarkRunner.Run<Bench>();

/// <summary>
/// BenchmarkDotNet benchmarks that time the framework <c>Shuffle</c> methods against
/// <c>Impl.ShuffleUnsafe</c> for 128-bit and 256-bit byte vectors.
/// </summary>
public class Bench
{
	// Inputs are instance fields filled once in Setup; each benchmark reads them.
	public Vector128<byte> A128;
	public Vector128<byte> B128;

	public Vector256<byte> A256;
	public Vector256<byte> B256;

	/// <summary>
	/// Fills all four input vectors with pseudo-random bytes from a fixed-seed generator,
	/// in the order A128, B128, A256, B256.
	/// </summary>
	[GlobalSetup]
	public void Setup()
	{
		// Overwrites every byte of `target` with the next bytes drawn from `rng`.
		static void Randomize<T>(Random rng, ref T target) where T : struct
			=> rng.NextBytes(MemoryMarshal.AsBytes(new Span<T>(ref target)));

		Random generator = new(71);
		Randomize(generator, ref A128);
		Randomize(generator, ref B128);
		Randomize(generator, ref A256);
		Randomize(generator, ref B256);
	}

	/// <summary>Framework 128-bit shuffle of <c>A128</c> by <c>B128</c>.</summary>
	[Benchmark]
	public Vector128<byte> Shuffle128() => Vector128.Shuffle(A128, B128);

	/// <summary><c>Impl.ShuffleUnsafe</c> 128-bit shuffle of <c>A128</c> by <c>B128</c>.</summary>
	[Benchmark]
	public Vector128<byte> ShuffleUnsafe128() => Impl.ShuffleUnsafe(A128, B128);

	/// <summary>Framework 256-bit shuffle of <c>A256</c> by <c>B256</c>.</summary>
	[Benchmark]
	public Vector256<byte> Shuffle256() => Vector256.Shuffle(A256, B256);

	/// <summary><c>Impl.ShuffleUnsafe</c> 256-bit shuffle of <c>A256</c> by <c>B256</c>.</summary>
	[Benchmark]
	public Vector256<byte> ShuffleUnsafe256() => Impl.ShuffleUnsafe(A256, B256);
}

/// <summary>
/// Byte shuffles that call the x86 shuffle instructions directly when the CPU supports
/// them and otherwise defer to the portable <c>Vector128.Shuffle</c> / <c>Vector256.Shuffle</c>.
/// </summary>
/// <remarks>
/// The hardware and fallback paths are not required to agree for every index byte; which
/// index bits are honoured depends on the path taken. Callers should only rely on results
/// for indices that address an element of <c>values</c>.
/// </remarks>
public static class Impl
{
	/// <summary>
	/// Picks bytes of <paramref name="values"/> according to <paramref name="indices"/>.
	/// </summary>
	/// <param name="values">Source bytes.</param>
	/// <param name="indices">Per-element selector into <paramref name="values"/>.</param>
	/// <returns>
	/// The result of <c>Ssse3.Shuffle</c> when SSSE3 is supported, otherwise the result of
	/// <c>Vector128.Shuffle</c>.
	/// </returns>
	public static Vector128<byte> ShuffleUnsafe(Vector128<byte> values, Vector128<byte> indices)
		=> Ssse3.IsSupported
			? Ssse3.Shuffle(values, indices)
			: Vector128.Shuffle(values, indices);

	/// <summary>
	/// Picks bytes of <paramref name="values"/> according to <paramref name="indices"/>,
	/// allowing an index to address either 128-bit half of <paramref name="values"/>.
	/// </summary>
	/// <param name="values">Source bytes.</param>
	/// <param name="indices">Per-element selector into <paramref name="values"/>.</param>
	/// <returns>
	/// On AVX2 hardware, a blend of two in-lane shuffles (see comments in the body);
	/// otherwise the result of <c>Vector256.Shuffle</c>.
	/// </returns>
	public static Vector256<byte> ShuffleUnsafe(Vector256<byte> values, Vector256<byte> indices)
	{
		if (!Avx2.IsSupported)
		{
			return Vector256.Shuffle(values, indices);
		}

		// 0x10 in every byte of the upper half, 0 in the lower half: XOR-ing it into the
		// indices flips bit 4 for upper-half elements only, so that afterwards bit 4 set
		// means "this element wants a byte from the OTHER 128-bit half".
		Vector256<byte> upperHalfBit = Vector256.Create(Vector128<byte>.Zero, Vector128.Create((byte)0x10));
		// Keep bit 7 and bits 0-4; bits 5 and 6 are dropped before the comparison.
		Vector256<byte> comparedBits = Vector256.Create((byte)0x9F);
		Vector256<sbyte> lastSameHalfIndex = Vector256.Create((sbyte)0x0F);

		// Avx2.Shuffle (VPSHUFB) selects within each 128-bit half independently, so one
		// shuffle reads from the element's own half ...
		Vector256<byte> fromOwnHalf = Avx2.Shuffle(values, indices);

		// ... and a second shuffle over the half-swapped input reads from the other half.
		Vector256<byte> halvesSwapped = Avx2.Permute2x128(values, values, 0b00000001);
		Vector256<byte> fromOtherHalf = Avx2.Shuffle(halvesSwapped, indices);

		// Signed compare: a byte with bit 7 set is negative and never exceeds 0x0F, so
		// such elements keep the own-half result.
		Vector256<byte> halfRelative = (indices ^ upperHalfBit) & comparedBits;
		Vector256<byte> useOtherHalf = Avx2.CompareGreaterThan(halfRelative.AsSByte(), lastSameHalfIndex).AsByte();

		return Avx2.BlendVariable(fromOwnHalf, fromOtherHalf, useOtherHalf);
	}
}