Skip to content

Instantly share code, notes, and snippets.

@DBalashov
Created December 8, 2024 12:46
Show Gist options
  • Save DBalashov/4ad2d64228bfc26c7539c1d2187a7d91 to your computer and use it in GitHub Desktop.
Save DBalashov/4ad2d64228bfc26c7539c1d2187a7d91 to your computer and use it in GitHub Desktop.
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
// | Method | Categories | N | Mean | Error | StdDev | Ratio |
// |------------------- |----------- |------- |--------------:|-----------:|-----------:|------:|
// | r2_double_simple | double | 256 | 328.16 ns | 0.666 ns | 0.623 ns | 1.00 |
// | r2_double_SIMD | double | 256 | 83.81 ns | 0.194 ns | 0.172 ns | 0.26 |
// | r2_double_SIMD_512 | double | 256 | 57.43 ns | 0.121 ns | 0.113 ns | 0.18 |
// | | | | | | | |
// | r2_double_simple | double | 1024 | 1,306.46 ns | 2.553 ns | 2.388 ns | 1.00 |
// | r2_double_SIMD | double | 1024 | 332.19 ns | 0.553 ns | 0.490 ns | 0.25 |
// | r2_double_SIMD_512 | double | 1024 | 211.49 ns | 0.684 ns | 0.640 ns | 0.16 |
// | | | | | | | |
// | r2_double_simple | double | 8192 | 10,447.47 ns | 18.444 ns | 16.350 ns | 1.00 |
// | r2_double_SIMD | double | 8192 | 2,640.74 ns | 9.755 ns | 9.125 ns | 0.25 |
// | r2_double_SIMD_512 | double | 8192 | 1,744.60 ns | 15.390 ns | 14.396 ns | 0.17 |
// | | | | | | | |
// | r2_double_simple | double | 32768 | 42,124.57 ns | 129.262 ns | 120.912 ns | 1.00 |
// | r2_double_SIMD | double | 32768 | 10,619.25 ns | 25.535 ns | 22.636 ns | 0.25 |
// | r2_double_SIMD_512 | double | 32768 | 7,215.71 ns | 22.042 ns | 20.618 ns | 0.17 |
// | | | | | | | |
// | r2_double_simple | double | 262144 | 336,632.37 ns | 433.968 ns | 338.814 ns | 1.00 |
// | r2_double_SIMD | double | 262144 | 86,340.07 ns | 358.094 ns | 334.962 ns | 0.26 |
// | r2_double_SIMD_512 | double | 262144 | 60,434.60 ns | 286.446 ns | 267.941 ns | 0.18 |
// | | | | | | | |
// | r2_simple_float | float | 256 | 330.74 ns | 0.494 ns | 0.438 ns | 1.00 |
// | r2_SIMD_float | float | 256 | 48.32 ns | 0.101 ns | 0.094 ns | 0.15 |
// | r2_SIMD_float_512 | float | 256 | 35.79 ns | 0.077 ns | 0.064 ns | 0.11 |
// | | | | | | | |
// | r2_simple_float | float | 1024 | 1,315.37 ns | 2.455 ns | 2.177 ns | 1.00 |
// | r2_SIMD_float | float | 1024 | 173.25 ns | 0.367 ns | 0.343 ns | 0.13 |
// | r2_SIMD_float_512 | float | 1024 | 108.02 ns | 0.433 ns | 0.405 ns | 0.08 |
// | | | | | | | |
// | r2_simple_float | float | 8192 | 10,460.41 ns | 24.251 ns | 22.685 ns | 1.00 |
// | r2_SIMD_float | float | 8192 | 1,329.44 ns | 2.635 ns | 2.201 ns | 0.13 |
// | r2_SIMD_float_512 | float | 8192 | 865.05 ns | 3.402 ns | 3.182 ns | 0.08 |
// | | | | | | | |
// | r2_simple_float | float | 32768 | 42,468.47 ns | 413.560 ns | 386.844 ns | 1.00 |
// | r2_SIMD_float | float | 32768 | 5,271.61 ns | 21.083 ns | 18.690 ns | 0.12 |
// | r2_SIMD_float_512 | float | 32768 | 3,465.70 ns | 6.359 ns | 5.637 ns | 0.08 |
// | | | | | | | |
// | r2_simple_float | float | 262144 | 335,978.50 ns | 537.433 ns | 476.421 ns | 1.00 |
// | r2_SIMD_float | float | 262144 | 43,055.83 ns | 155.537 ns | 145.490 ns | 0.13 |
// | r2_SIMD_float_512 | float | 262144 | 29,530.19 ns | 80.202 ns | 75.021 ns | 0.09 |
// CS8618 = "non-nullable field must contain a non-null value when exiting constructor".
// Presumably suppressed because MainTest's array fields are assigned in GlobalSetup,
// not in a constructor.
#pragma warning disable CS8618
// Program entry point (top-level statement): run the benchmarks declared on MainTest.
var summary = BenchmarkRunner.Run<MainTest>();
[SimpleJob(RuntimeMoniker.Net90), GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory), CategoriesColumn]
// [WarmupCount(2)]
// [IterationCount(2)]
// [MemoryDiagnoser]
public class MainTest
{
/// <summary>
/// Number of elements in each input array. GlobalSetup allocates every array with this length;
/// the Params attribute lists the sizes to benchmark (256 up to 256 * 1024).
/// </summary>
[Params(256, 1024, 8 * 1024, 32 * 1024, 256 * 1024)]
public int N { get; set; }
/// <summary>
/// Allocates the four input arrays with <see cref="N"/> pseudo-random values each.
/// The double arrays hold <see cref="Random.NextDouble"/> results directly; the float arrays
/// hold the same kind of value narrowed to <c>float</c>.
/// </summary>
/// <remarks>
/// The generator is created with a constant seed, so every invocation with the same
/// <see cref="N"/> produces identical data and measurements stay comparable between runs.
/// </remarks>
[GlobalSetup]
public void GlobalSetup()
{
    // Fixed seed instead of Random.Shared: the inputs must not change from one setup call to the next.
    var rng = new Random(42);

    // Fill order (arr1, arr2, arr1f, arr2f) determines which part of the sequence each array gets.
    arr1 = FillDoubles();
    arr2 = FillDoubles();
    arr1f = FillSingles();
    arr2f = FillSingles();

    // Returns a new array of N values taken from the seeded generator.
    double[] FillDoubles()
    {
        var values = new double[N];
        for (var i = 0; i < values.Length; i++)
        {
            values[i] = rng.NextDouble();
        }

        return values;
    }

    // Same as FillDoubles, but each value is narrowed to single precision.
    float[] FillSingles()
    {
        var values = new float[N];
        for (var i = 0; i < values.Length; i++)
        {
            values[i] = (float) rng.NextDouble();
        }

        return values;
    }
}
// Double-precision inputs, assigned in GlobalSetup.
// The "double" benchmarks pass arr1 as `fact` and arr2 as `prog`.
double[] arr1;
double[] arr2;
// Single-precision inputs, assigned in GlobalSetup.
// The "float" benchmarks pass arr1f as `fact` and arr2f as `prog`.
float[] arr1f;
float[] arr2f;
#region double
/// <summary>
/// Baseline of the "double" category: the scalar implementation applied to arr1 / arr2.
/// </summary>
[Benchmark(Baseline = true)]
[BenchmarkCategory("double")]
public double r2_double_simple()
{
    return r2_simple_double(arr1, arr2);
}
/// <summary>
/// "double" category: the Vector256 implementation applied to arr1 / arr2.
/// </summary>
[Benchmark]
[BenchmarkCategory("double")]
public double r2_double_SIMD()
{
    return r2_SIMD_double(arr1, arr2);
}
/// <summary>
/// "double" category: the Vector512-loading implementation applied to arr1 / arr2.
/// </summary>
[Benchmark]
[BenchmarkCategory("double")]
public double r2_double_SIMD_512()
{
    return r2_SIMD_double_512(arr1, arr2);
}
/// <summary>
/// Scalar reference implementation of the coefficient of determination (R-squared),
/// computed as <c>1 - SSres / SStot</c> in double precision.
/// </summary>
/// <param name="fact">Observed values; their mean is the centre used for SStot.</param>
/// <param name="prog">Predicted values; must have the same length as <paramref name="fact"/>.</param>
/// <returns><c>1 - SSres / SStot</c>.</returns>
/// <exception cref="ArgumentNullException">Either array is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The arrays differ in length.</exception>
static double r2_simple_double(double[] fact, double[] prog)
{
    ArgumentNullException.ThrowIfNull(fact);
    ArgumentNullException.ThrowIfNull(prog);

    var count = fact.Length;
    if (count != prog.Length)
    {
        throw new ArgumentException("Arrays must have the same length");
    }

    // Pass 1: arithmetic mean of the observed values.
    var mean = 0.0;
    for (var k = 0; k < count; k++)
    {
        mean += fact[k];
    }

    mean /= count;

    // Pass 2: total (around the mean) and residual (observed - predicted) sums of squares.
    var totalSumOfSquares = 0.0;
    var residualSumOfSquares = 0.0;
    for (var k = 0; k < count; k++)
    {
        var observed = fact[k];
        var deviation = observed - mean;
        var residual = observed - prog[k];
        totalSumOfSquares += deviation * deviation;
        residualSumOfSquares += residual * residual;
    }

    return 1 - residualSumOfSquares / totalSumOfSquares;
}
/// <summary>
/// Computes the coefficient of determination (<c>1 - SSres / SStot</c>) for two double arrays,
/// handling four elements per step through <see cref="Vector256{T}"/> and finishing the
/// elements that do not fill a whole vector with scalar code.
/// </summary>
/// <param name="fact">Observed values; their mean is the centre used for SStot.</param>
/// <param name="prog">Predicted values; must have the same length as <paramref name="fact"/>.</param>
/// <returns><c>1 - SSres / SStot</c>.</returns>
/// <exception cref="ArgumentNullException">Either array is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The arrays differ in length.</exception>
static double r2_SIMD_double(double[] fact, double[] prog)
{
    ArgumentNullException.ThrowIfNull(fact);
    ArgumentNullException.ThrowIfNull(prog);

    var count = fact.Length;
    if (count != prog.Length)
    {
        throw new ArgumentException("Arrays must have the same length");
    }

    // View both arrays as 256-bit blocks; indices from tailStart onwards do not fill a block.
    var factBlocks = MemoryMarshal.Cast<double, Vector256<double>>(fact);
    var progBlocks = MemoryMarshal.Cast<double, Vector256<double>>(prog);
    var tailStart = factBlocks.Length * Vector256<double>.Count;

    // Pass 1: mean of the observed values (per-lane sums, then lanes, then the scalar tail).
    var laneSums = Vector256<double>.Zero;
    for (var b = 0; b < factBlocks.Length; b++)
    {
        laneSums += factBlocks[b];
    }

    var mean = SumLanes(laneSums);
    for (var k = tailStart; k < count; k++)
    {
        mean += fact[k];
    }

    mean /= count;

    // Pass 2: per-lane total and residual sums of squares over the whole blocks.
    var meanLanes = Vector256.Create(mean);
    var totalLanes = Vector256<double>.Zero;
    var residualLanes = Vector256<double>.Zero;
    for (var b = 0; b < factBlocks.Length; b++)
    {
        var observed = factBlocks[b];
        var deviation = observed - meanLanes;
        var residual = observed - progBlocks[b];
        totalLanes += deviation * deviation;
        residualLanes += residual * residual;
    }

    // Collapse the lanes, then add the contribution of the scalar tail.
    var totalSumOfSquares = SumLanes(totalLanes);
    var residualSumOfSquares = SumLanes(residualLanes);
    for (var k = tailStart; k < count; k++)
    {
        var deviation = fact[k] - mean;
        var residual = fact[k] - prog[k];
        totalSumOfSquares += deviation * deviation;
        residualSumOfSquares += residual * residual;
    }

    return 1 - residualSumOfSquares / totalSumOfSquares;

    // Adds the four lanes from left to right.
    static double SumLanes(Vector256<double> lanes) => lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
/// <summary>
/// Computes the coefficient of determination (<c>1 - SSres / SStot</c>) for two double arrays.
/// Data is read eight elements at a time as <see cref="Vector512{T}"/>; each block is split into
/// its lower and upper halves, which are accumulated in <see cref="Vector256{T}"/> registers.
/// Elements that do not fill a whole 512-bit block are handled with scalar code.
/// </summary>
/// <param name="fact">Observed values; their mean is the centre used for SStot.</param>
/// <param name="prog">Predicted values; must have the same length as <paramref name="fact"/>.</param>
/// <returns><c>1 - SSres / SStot</c>.</returns>
/// <exception cref="ArgumentNullException">Either array is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The arrays differ in length.</exception>
static double r2_SIMD_double_512(double[] fact, double[] prog)
{
    ArgumentNullException.ThrowIfNull(fact);
    ArgumentNullException.ThrowIfNull(prog);

    var count = fact.Length;
    if (count != prog.Length)
    {
        throw new ArgumentException("Arrays must have the same length");
    }

    // View both arrays as 512-bit blocks; indices from tailStart onwards do not fill a block.
    var factBlocks = MemoryMarshal.Cast<double, Vector512<double>>(fact);
    var progBlocks = MemoryMarshal.Cast<double, Vector512<double>>(prog);
    var tailStart = factBlocks.Length * Vector512<double>.Count;

    // Pass 1: mean. Each block is folded (lower + upper) before being added to the lane sums.
    var laneSums = Vector256<double>.Zero;
    for (var b = 0; b < factBlocks.Length; b++)
    {
        var block = factBlocks[b];
        laneSums += block.GetLower() + block.GetUpper();
    }

    var mean = SumLanes(laneSums);
    for (var k = tailStart; k < count; k++)
    {
        mean += fact[k];
    }

    mean /= count;

    // Pass 2: per-lane total and residual sums of squares over the whole blocks.
    var meanLanes = Vector256.Create(mean);
    var totalLanes = Vector256<double>.Zero;
    var residualLanes = Vector256<double>.Zero;
    for (var b = 0; b < factBlocks.Length; b++)
    {
        var observed = factBlocks[b];
        var predicted = progBlocks[b];
        var observedLow = observed.GetLower();
        var observedHigh = observed.GetUpper();

        var deviationLow = observedLow - meanLanes;
        var deviationHigh = observedHigh - meanLanes;
        totalLanes += deviationHigh * deviationHigh + deviationLow * deviationLow;

        var residualLow = observedLow - predicted.GetLower();
        var residualHigh = observedHigh - predicted.GetUpper();
        residualLanes += residualLow * residualLow + residualHigh * residualHigh;
    }

    // Collapse the lanes, then add the contribution of the scalar tail.
    var totalSumOfSquares = SumLanes(totalLanes);
    var residualSumOfSquares = SumLanes(residualLanes);
    for (var k = tailStart; k < count; k++)
    {
        var deviation = fact[k] - mean;
        var residual = fact[k] - prog[k];
        totalSumOfSquares += deviation * deviation;
        residualSumOfSquares += residual * residual;
    }

    return 1 - residualSumOfSquares / totalSumOfSquares;

    // Adds the four lanes from left to right.
    static double SumLanes(Vector256<double> lanes) => lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
#endregion
#region float
/// <summary>
/// Baseline of the "float" category: the scalar implementation applied to arr1f / arr2f.
/// </summary>
[Benchmark(Baseline = true)]
[BenchmarkCategory("float")]
public double r2_simple_float()
{
    return r2_simple_float(arr1f, arr2f);
}
/// <summary>
/// "float" category: the Vector256 implementation applied to arr1f / arr2f.
/// </summary>
[Benchmark]
[BenchmarkCategory("float")]
public double r2_SIMD_float()
{
    return r2_SIMD_float(arr1f, arr2f);
}
/// <summary>
/// "float" category: the Vector512-loading implementation applied to arr1f / arr2f.
/// </summary>
[Benchmark]
[BenchmarkCategory("float")]
public double r2_SIMD_float_512()
{
    return r2_SIMD_float_512(arr1f, arr2f);
}
/// <summary>
/// Scalar reference implementation of the coefficient of determination
/// (<c>1 - SSres / SStot</c>) for float inputs.
/// </summary>
/// <remarks>
/// The mean is accumulated in single precision. Each squared term is computed in single
/// precision as well, but the two sums of squares are accumulated in double precision.
/// </remarks>
/// <param name="fact">Observed values; their mean is the centre used for SStot.</param>
/// <param name="prog">Predicted values; must have the same length as <paramref name="fact"/>.</param>
/// <returns><c>1 - SSres / SStot</c>.</returns>
/// <exception cref="ArgumentNullException">Either array is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The arrays differ in length.</exception>
static double r2_simple_float(float[] fact, float[] prog)
{
    ArgumentNullException.ThrowIfNull(fact);
    ArgumentNullException.ThrowIfNull(prog);

    var count = fact.Length;
    if (count != prog.Length)
    {
        throw new ArgumentException("Arrays must have the same length");
    }

    // Pass 1: arithmetic mean of the observed values (float accumulator).
    var mean = 0.0f;
    for (var k = 0; k < count; k++)
    {
        mean += fact[k];
    }

    mean /= count;

    // Pass 2: float differences and squares, widened when added to the double accumulators.
    var totalSumOfSquares = 0.0;
    var residualSumOfSquares = 0.0;
    for (var k = 0; k < count; k++)
    {
        var observed = fact[k];
        var deviation = observed - mean;
        var residual = observed - prog[k];
        totalSumOfSquares += deviation * deviation;
        residualSumOfSquares += residual * residual;
    }

    return 1 - residualSumOfSquares / totalSumOfSquares;
}
/// <summary>
/// Computes the coefficient of determination (<c>1 - SSres / SStot</c>) for two float arrays,
/// handling eight elements per step through <see cref="Vector256{T}"/> and finishing the
/// elements that do not fill a whole vector with scalar code.
/// </summary>
/// <remarks>
/// All arithmetic, including the final <c>1 - SSres / SStot</c>, is carried out in single
/// precision; the float result is widened to double on return.
/// </remarks>
/// <param name="fact">Observed values; their mean is the centre used for SStot.</param>
/// <param name="prog">Predicted values; must have the same length as <paramref name="fact"/>.</param>
/// <returns><c>1 - SSres / SStot</c>.</returns>
/// <exception cref="ArgumentNullException">Either array is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The arrays differ in length.</exception>
static double r2_SIMD_float(float[] fact, float[] prog)
{
    ArgumentNullException.ThrowIfNull(fact);
    ArgumentNullException.ThrowIfNull(prog);

    var count = fact.Length;
    if (count != prog.Length)
    {
        throw new ArgumentException("Arrays must have the same length");
    }

    // View both arrays as 256-bit blocks; indices from tailStart onwards do not fill a block.
    var factBlocks = MemoryMarshal.Cast<float, Vector256<float>>(fact);
    var progBlocks = MemoryMarshal.Cast<float, Vector256<float>>(prog);
    var tailStart = factBlocks.Length * Vector256<float>.Count;

    // Pass 1: mean of the observed values (per-lane sums, then lanes, then the scalar tail).
    var laneSums = Vector256<float>.Zero;
    for (var b = 0; b < factBlocks.Length; b++)
    {
        laneSums += factBlocks[b];
    }

    var mean = SumLanes(laneSums);
    for (var k = tailStart; k < count; k++)
    {
        mean += fact[k];
    }

    mean /= count;

    // Pass 2: per-lane total and residual sums of squares over the whole blocks.
    var meanLanes = Vector256.Create(mean);
    var totalLanes = Vector256<float>.Zero;
    var residualLanes = Vector256<float>.Zero;
    for (var b = 0; b < factBlocks.Length; b++)
    {
        var observed = factBlocks[b];
        var predicted = progBlocks[b];
        var deviation = observed - meanLanes;
        var residual = observed - predicted;
        totalLanes += deviation * deviation;
        residualLanes += residual * residual;
    }

    // Collapse the lanes, then add the contribution of the scalar tail.
    var totalSumOfSquares = SumLanes(totalLanes);
    var residualSumOfSquares = SumLanes(residualLanes);
    for (var k = tailStart; k < count; k++)
    {
        var deviation = fact[k] - mean;
        var residual = fact[k] - prog[k];
        totalSumOfSquares += deviation * deviation;
        residualSumOfSquares += residual * residual;
    }

    return 1 - residualSumOfSquares / totalSumOfSquares;

    // Adds the eight lanes from left to right.
    static float SumLanes(Vector256<float> lanes) =>
        lanes[0] + lanes[1] + lanes[2] + lanes[3] + lanes[4] + lanes[5] + lanes[6] + lanes[7];
}
/// <summary>
/// Computes the coefficient of determination (<c>1 - SSres / SStot</c>) for two float arrays.
/// Data is read sixteen elements at a time as <see cref="Vector512{T}"/>; each block is split
/// into its lower and upper halves, which are accumulated in <see cref="Vector256{T}"/> registers.
/// Elements that do not fill a whole 512-bit block are handled with scalar code.
/// </summary>
/// <remarks>
/// All arithmetic, including the final <c>1 - SSres / SStot</c>, is carried out in single
/// precision; the float result is widened to double on return.
/// </remarks>
/// <param name="fact">Observed values; their mean is the centre used for SStot.</param>
/// <param name="prog">Predicted values; must have the same length as <paramref name="fact"/>.</param>
/// <returns><c>1 - SSres / SStot</c>.</returns>
/// <exception cref="ArgumentNullException">Either array is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The arrays differ in length.</exception>
static double r2_SIMD_float_512(float[] fact, float[] prog)
{
    ArgumentNullException.ThrowIfNull(fact);
    ArgumentNullException.ThrowIfNull(prog);

    var count = fact.Length;
    if (count != prog.Length)
    {
        throw new ArgumentException("Arrays must have the same length");
    }

    // View both arrays as 512-bit blocks; indices from tailStart onwards do not fill a block.
    var factBlocks = MemoryMarshal.Cast<float, Vector512<float>>(fact);
    var progBlocks = MemoryMarshal.Cast<float, Vector512<float>>(prog);
    var tailStart = factBlocks.Length * Vector512<float>.Count;

    // Pass 1: mean. Each block is folded (lower + upper) before being added to the lane sums.
    var laneSums = Vector256<float>.Zero;
    for (var b = 0; b < factBlocks.Length; b++)
    {
        var block = factBlocks[b];
        laneSums += block.GetLower() + block.GetUpper();
    }

    var mean = SumLanes(laneSums);
    for (var k = tailStart; k < count; k++)
    {
        mean += fact[k];
    }

    mean /= count;

    // Pass 2: per-lane total and residual sums of squares over the whole blocks.
    var meanLanes = Vector256.Create(mean);
    var totalLanes = Vector256<float>.Zero;
    var residualLanes = Vector256<float>.Zero;
    for (var b = 0; b < factBlocks.Length; b++)
    {
        var observed = factBlocks[b];
        var observedLow = observed.GetLower();
        var observedHigh = observed.GetUpper();

        var deviationLow = observedLow - meanLanes;
        var deviationHigh = observedHigh - meanLanes;
        totalLanes += deviationHigh * deviationHigh + deviationLow * deviationLow;

        var predicted = progBlocks[b];
        var residualLow = observedLow - predicted.GetLower();
        var residualHigh = observedHigh - predicted.GetUpper();
        residualLanes += residualLow * residualLow + residualHigh * residualHigh;
    }

    // Collapse the lanes, then add the contribution of the scalar tail.
    var totalSumOfSquares = SumLanes(totalLanes);
    var residualSumOfSquares = SumLanes(residualLanes);
    for (var k = tailStart; k < count; k++)
    {
        var deviation = fact[k] - mean;
        var residual = fact[k] - prog[k];
        totalSumOfSquares += deviation * deviation;
        residualSumOfSquares += residual * residual;
    }

    return 1 - residualSumOfSquares / totalSumOfSquares;

    // Adds the eight lanes from left to right.
    static float SumLanes(Vector256<float> lanes) =>
        lanes[0] + lanes[1] + lanes[2] + lanes[3] + lanes[4] + lanes[5] + lanes[6] + lanes[7];
}
#endregion
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment