Last active
July 15, 2018 01:23
-
-
Save mellinoe/e8b402c65da581d93445 to your computer and use it in GitHub Desktop.
A basic example to show how to vectorize an algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.Numerics; | |
class Program | |
{ | |
static unsafe void Main(string[] args) | |
{ | |
Console.WriteLine("Degree of vectorization: " + Vector<float>.Count); | |
Random rand = new Random(); | |
float[] vectorConverts = new float[50000000]; | |
for (int i = 0; i < vectorConverts.Length; i++) | |
{ | |
vectorConverts[i] = (float)rand.NextDouble(); | |
} | |
float[] scalarConverts = (float[])vectorConverts.Clone(); | |
Stopwatch swVec = Stopwatch.StartNew(); | |
ConvertAllToSrgb(vectorConverts); | |
swVec.Stop(); | |
Stopwatch swScalar = Stopwatch.StartNew(); | |
for (int i = 0; i < vectorConverts.Length; i++) | |
{ | |
scalarConverts[i] = LinearToSRGB_Scalar(scalarConverts[i]); | |
} | |
swScalar.Stop(); | |
const float maxPermissibleDelta = .000001f; | |
// Here I am just going to check the result of my vector algorithm against the results | |
// from your scalar implementation from your repo to verify that the results are correct. | |
for (int i = 0; i < vectorConverts.Length; i++) | |
{ | |
if (Math.Abs(vectorConverts[i] - scalarConverts[i]) >= maxPermissibleDelta) | |
{ | |
Console.WriteLine("Difference at " + i.ToString("00000") + $": {vectorConverts[i]}"); | |
Console.WriteLine($" != {scalarConverts[i]}"); | |
} | |
} | |
Console.Write($"Finished. Scalar time: {swScalar.ElapsedMilliseconds} ms. Vector time: {swVec.ElapsedMilliseconds} ms."); | |
} | |
// These are essentially the contants in y our formula. | |
private static readonly Vector<float> lowerLimit = new Vector<float>(0.0031308f); | |
private static readonly Vector<float> lowerScaleFactor = new Vector<float>(12.92f); | |
private static readonly Vector<float> aVec = new Vector<float>(0.055f); | |
private static readonly Vector<float> aPlusOne = new Vector<float>(1.055f); | |
private static readonly Vector<float> upperExponent = new Vector<float>(1f / 2.4f); | |
private static unsafe void ConvertAllToSrgb(float[] values) | |
{ | |
for (int i = 0; i < values.Length; i += Vector<float>.Count) | |
{ | |
Vector<float> channels = new Vector<float>(values, i); | |
Vector<int> lessOrEqual = Vector.LessThanOrEqual(channels, lowerLimit); | |
Vector<int> greater = Vector.OnesComplement(lessOrEqual); | |
/* The algorithm is split into two parts. | |
* First, we compute the values that would be the result of the "upper" part of the computation, | |
i.e. for the values that are above the cutoff threshold. | |
* Second, we compute the values from the "lower" part of the computation, i.e. the linear scalaing by 12.92f. | |
* Third, we combine the two partial results from the first and second steps by using a ConditionalSelect | |
operation, which lets us "Select" a combination of elements from the above partial results to fill in | |
a correct final result, based on the comparison results of the threshold value. | |
*/ | |
// First: let's calculate the upper portion (the complicated part) | |
Vector<float> powdVec = Pow(channels, upperExponent); // Math.Pow(signal, 1 / 2.4f) | |
Vector<float> upperSum = aPlusOne * powdVec; // (a + 1) * (Math.Pow(signal, 1/ 2.4f)) | |
Vector<float> difference = upperSum - aVec; // ((a + 1) * (Math.Pow(signal, 1/ 2.4f))) - a | |
// We need to "apply the branching" of the algorithm. The elements from the above computation | |
// should only go into the result if the original value was greater than the cutoff value. | |
// This applies the conditional mask based on whether the original elements were GREATER THAN the cutoff. | |
Vector<float> upper = Vector.ConditionalSelect(greater, difference, Vector<float>.Zero); | |
// Second: let's compute the values from the "lower" part of the algorithm, | |
// i.e. for values that are lower than the cutoff point. | |
// Lower part (simple) | |
Vector<float> lowerAllBits = channels * lowerScaleFactor; | |
// This applies the conditional mask based on whether the original elements were LESS THAN OR EQUAL TO the cutoff. | |
Vector<float> lower = Vector.ConditionalSelect(lessOrEqual, lowerAllBits, Vector<float>.Zero); | |
// Third: combine the two half-calculations. The two vectors will not overlap in either of their elements | |
// because of the nature of the ConditionalSelect operations we are doing. I've added a debug asseriton to be | |
// sure of that invariant. | |
#if DEBUG | |
for (int g = 0; g < Vector<float>.Count; g++) | |
{ | |
Debug.Assert(upper[g] == 0 || lower[g] == 0); | |
} | |
#endif | |
Vector<float> finalConverted = lower | upper; // Could also just add. | |
finalConverted.CopyTo(values, i); // Copy the results back into the array. | |
} | |
} | |
private static unsafe Vector<float> Pow(Vector<float> baseVec, Vector<float> exponentVec) | |
{ | |
// NOTE: THIS IS NOT VECTORIZED AT ALL. | |
// THIS MAKES THE VECTOR VERSION INSANELY SLOW. | |
float* arr = stackalloc float[Vector<float>.Count]; | |
for (int i = 0; i < Vector<float>.Count; i++) | |
{ | |
arr[i] = (float)Math.Pow(baseVec[i], exponentVec[i]); | |
} | |
return new Vector<float>(arr, 0); | |
} | |
private static unsafe Vector<float> PowScalar(float[] bases, int index, float exponent) | |
{ | |
// This should be slightly faster than the above implementation as it operates on the original float[] array. | |
float* vectorPtr = stackalloc[Vector<float>.Count]; | |
for (int i = 0; i < Vector<float>.Count; i++) | |
{ | |
vectorPtr[i] = Math.Pow(bases[index + i], exponent); | |
} | |
return new Vector<float>(vectorPtr); | |
} | |
// This is the unchanged algorithm from your repo. | |
private static float LinearToSRGB_Scalar(float signal) | |
{ | |
float a = 0.055f; | |
if (signal <= 0.0031308) | |
{ | |
return signal * 12.92f; | |
} | |
return ((float)((1 + a) * Math.Pow(signal, 1 / 2.4f))) - a; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment