mellinoe · July 15, 2018 01:23
diff --git a/LinearToSRGB_VectorExample.cs b/LinearToSRGB_VectorExample.cs
 using System;
 using System.Diagnostics;
 using System.Numerics;

 class Program
 {
    static unsafe void Main(string[] args)
    {
        Console.WriteLine("Degree of vectorization: " + Vector<float>.Count);
        Random rand = new Random();
        float[] vectorConverts = new float[50000000];

        for (int i = 0; i < vectorConverts.Length; i++)
        {
            vectorConverts[i] = (float)rand.NextDouble();
        }
        float[] scalarConverts = (float[])vectorConverts.Clone();

        Stopwatch swVec = Stopwatch.StartNew();
        ConvertAllToSrgb(vectorConverts);
        swVec.Stop();

        Stopwatch swScalar = Stopwatch.StartNew();
        for (int i = 0; i < vectorConverts.Length; i++)
        {
            scalarConverts[i] = LinearToSRGB_Scalar(scalarConverts[i]);
        }
        swScalar.Stop();

        const float maxPermissibleDelta = .000001f;
        // Here I am just going to check the result of my vector algorithm against the results
        // from your scalar implementation from your repo to verify that the results are correct.
        for (int i = 0; i < vectorConverts.Length; i++)
        {
            if (Math.Abs(vectorConverts[i] - scalarConverts[i]) >= maxPermissibleDelta)
            {
                Console.WriteLine("Difference at " + i.ToString("00000") + $": {vectorConverts[i]}");
                Console.WriteLine($"                  != {scalarConverts[i]}");
            }
        }

        Console.Write($"Finished. Scalar time: {swScalar.ElapsedMilliseconds} ms. Vector time: {swVec.ElapsedMilliseconds} ms.");
    }

    // These are essentially the contants in y our formula.
    private static readonly Vector<float> lowerLimit = new Vector<float>(0.0031308f);
    private static readonly Vector<float> lowerScaleFactor = new Vector<float>(12.92f);
    private static readonly Vector<float> aVec = new Vector<float>(0.055f);
    private static readonly Vector<float> aPlusOne = new Vector<float>(1.055f);
    private static readonly Vector<float> upperExponent = new Vector<float>(1f / 2.4f);

    private static unsafe void ConvertAllToSrgb(float[] values)
    {
        for (int i = 0; i < values.Length; i += Vector<float>.Count)
        {
            Vector<float> channels = new Vector<float>(values, i);
            Vector<int> lessOrEqual = Vector.LessThanOrEqual(channels, lowerLimit);
            Vector<int> greater = Vector.OnesComplement(lessOrEqual);
            
            /* The algorithm is split into two parts.
               * First, we compute the values that would be the result of the "upper" part of the computation,
                 i.e. for the values that are above the cutoff threshold.
               * Second, we compute the values from the "lower" part of the computation, i.e. the linear scalaing by 12.92f.
               * Third, we combine the two partial results from the first and second steps by using a ConditionalSelect
                 operation, which lets us "Select" a combination of elements from the above partial results to fill in
                 a correct final result, based on the comparison results of the threshold value.
            */

            // First: let's calculate the upper portion (the complicated part)
            Vector<float> powdVec = Pow(channels, upperExponent); // Math.Pow(signal, 1 / 2.4f)
            Vector<float> upperSum = aPlusOne * powdVec; // (a + 1) * (Math.Pow(signal, 1/ 2.4f))
            Vector<float> difference = upperSum - aVec; // ((a + 1) * (Math.Pow(signal, 1/ 2.4f))) - a
            
            // We need to "apply the branching" of the algorithm. The elements from the above computation
            // should only go into the result if the original value was greater than the cutoff value.
            // This applies the conditional mask based on whether the original elements were GREATER THAN the cutoff.
            Vector<float> upper = Vector.ConditionalSelect(greater, difference, Vector<float>.Zero);

            // Second: let's compute the values from the "lower" part of the algorithm,
            // i.e. for values that are lower than the cutoff point.

            // Lower part (simple)
            Vector<float> lowerAllBits = channels * lowerScaleFactor;
            // This applies the conditional mask based on whether the original elements were LESS THAN OR EQUAL TO the cutoff.
            Vector<float> lower = Vector.ConditionalSelect(lessOrEqual, lowerAllBits, Vector<float>.Zero);

            // Third: combine the two half-calculations. The two vectors will not overlap in either of their elements
            // because of the nature of the ConditionalSelect operations we are doing. I've added a debug asseriton to be
            // sure of that invariant.
 #if DEBUG
                for (int g = 0; g < Vector<float>.Count; g++)
                {
                    Debug.Assert(upper[g] == 0 || lower[g] == 0);
                }
 #endif
            Vector<float> finalConverted = lower | upper; // Could also just add.
            finalConverted.CopyTo(values, i); // Copy the results back into the array.
        }
    }

    private static unsafe Vector<float> Pow(Vector<float> baseVec, Vector<float> exponentVec)
    {
        // NOTE: THIS IS NOT VECTORIZED AT ALL.
        // THIS MAKES THE VECTOR VERSION INSANELY SLOW.
        float* arr = stackalloc float[Vector<float>.Count];
        for (int i = 0; i < Vector<float>.Count; i++)
        {
            arr[i] = (float)Math.Pow(baseVec[i], exponentVec[i]);
        }

        return new Vector<float>(arr, 0);
    }

    private static unsafe Vector<float> PowScalar(float[] bases, int index, float exponent)
    {
        // This should be slightly faster than the above implementation as it operates on the original float[] array.
        float* vectorPtr = stackalloc[Vector<float>.Count];
        for (int i = 0; i < Vector<float>.Count; i++)
        {
            vectorPtr[i] = Math.Pow(bases[index + i], exponent);
        }

        return new Vector<float>(vectorPtr);
    }

    // This is the unchanged algorithm from your repo.
    private static float LinearToSRGB_Scalar(float signal)
    {
        float a = 0.055f;

        if (signal <= 0.0031308)
        {
            return signal * 12.92f;
        }

        return ((float)((1 + a) * Math.Pow(signal, 1 / 2.4f))) - a;
    }
 }
	using System;
	using System.Diagnostics;
	using System.Numerics;

	class Program
	{
	static unsafe void Main(string[] args)
	{
	Console.WriteLine("Degree of vectorization: " + Vector<float>.Count);
	Random rand = new Random();
	float[] vectorConverts = new float[50000000];

	for (int i = 0; i < vectorConverts.Length; i++)
	{
	vectorConverts[i] = (float)rand.NextDouble();
	}
	float[] scalarConverts = (float[])vectorConverts.Clone();

	Stopwatch swVec = Stopwatch.StartNew();
	ConvertAllToSrgb(vectorConverts);
	swVec.Stop();

	Stopwatch swScalar = Stopwatch.StartNew();
	for (int i = 0; i < vectorConverts.Length; i++)
	{
	scalarConverts[i] = LinearToSRGB_Scalar(scalarConverts[i]);
	}
	swScalar.Stop();

	const float maxPermissibleDelta = .000001f;
	// Here I am just going to check the result of my vector algorithm against the results
	// from your scalar implementation from your repo to verify that the results are correct.
	for (int i = 0; i < vectorConverts.Length; i++)
	{
	if (Math.Abs(vectorConverts[i] - scalarConverts[i]) >= maxPermissibleDelta)
	{
	Console.WriteLine("Difference at " + i.ToString("00000") + $": {vectorConverts[i]}");
	Console.WriteLine($" != {scalarConverts[i]}");
	}
	}

	Console.Write($"Finished. Scalar time: {swScalar.ElapsedMilliseconds} ms. Vector time: {swVec.ElapsedMilliseconds} ms.");
	}

	// These are essentially the contants in y our formula.
	private static readonly Vector<float> lowerLimit = new Vector<float>(0.0031308f);
	private static readonly Vector<float> lowerScaleFactor = new Vector<float>(12.92f);
	private static readonly Vector<float> aVec = new Vector<float>(0.055f);
	private static readonly Vector<float> aPlusOne = new Vector<float>(1.055f);
	private static readonly Vector<float> upperExponent = new Vector<float>(1f / 2.4f);

	private static unsafe void ConvertAllToSrgb(float[] values)
	{
	for (int i = 0; i < values.Length; i += Vector<float>.Count)
	{
	Vector<float> channels = new Vector<float>(values, i);
	Vector<int> lessOrEqual = Vector.LessThanOrEqual(channels, lowerLimit);
	Vector<int> greater = Vector.OnesComplement(lessOrEqual);

	/* The algorithm is split into two parts.
	* First, we compute the values that would be the result of the "upper" part of the computation,
	i.e. for the values that are above the cutoff threshold.
	* Second, we compute the values from the "lower" part of the computation, i.e. the linear scalaing by 12.92f.
	* Third, we combine the two partial results from the first and second steps by using a ConditionalSelect
	operation, which lets us "Select" a combination of elements from the above partial results to fill in
	a correct final result, based on the comparison results of the threshold value.
	*/

	// First: let's calculate the upper portion (the complicated part)
	Vector<float> powdVec = Pow(channels, upperExponent); // Math.Pow(signal, 1 / 2.4f)
	Vector<float> upperSum = aPlusOne * powdVec; // (a + 1) * (Math.Pow(signal, 1/ 2.4f))
	Vector<float> difference = upperSum - aVec; // ((a + 1) * (Math.Pow(signal, 1/ 2.4f))) - a

	// We need to "apply the branching" of the algorithm. The elements from the above computation
	// should only go into the result if the original value was greater than the cutoff value.
	// This applies the conditional mask based on whether the original elements were GREATER THAN the cutoff.
	Vector<float> upper = Vector.ConditionalSelect(greater, difference, Vector<float>.Zero);

	// Second: let's compute the values from the "lower" part of the algorithm,
	// i.e. for values that are lower than the cutoff point.

	// Lower part (simple)
	Vector<float> lowerAllBits = channels * lowerScaleFactor;
	// This applies the conditional mask based on whether the original elements were LESS THAN OR EQUAL TO the cutoff.
	Vector<float> lower = Vector.ConditionalSelect(lessOrEqual, lowerAllBits, Vector<float>.Zero);

	// Third: combine the two half-calculations. The two vectors will not overlap in either of their elements
	// because of the nature of the ConditionalSelect operations we are doing. I've added a debug asseriton to be
	// sure of that invariant.
	#if DEBUG
	for (int g = 0; g < Vector<float>.Count; g++)
	{
	Debug.Assert(upper[g] == 0 \|\| lower[g] == 0);
	}
	#endif
	Vector<float> finalConverted = lower \| upper; // Could also just add.
	finalConverted.CopyTo(values, i); // Copy the results back into the array.
	}
	}

	private static unsafe Vector<float> Pow(Vector<float> baseVec, Vector<float> exponentVec)
	{
	// NOTE: THIS IS NOT VECTORIZED AT ALL.
	// THIS MAKES THE VECTOR VERSION INSANELY SLOW.
	float* arr = stackalloc float[Vector<float>.Count];
	for (int i = 0; i < Vector<float>.Count; i++)
	{
	arr[i] = (float)Math.Pow(baseVec[i], exponentVec[i]);
	}

	return new Vector<float>(arr, 0);
	}

	private static unsafe Vector<float> PowScalar(float[] bases, int index, float exponent)
	{
	// This should be slightly faster than the above implementation as it operates on the original float[] array.
	float* vectorPtr = stackalloc[Vector<float>.Count];
	for (int i = 0; i < Vector<float>.Count; i++)
	{
	vectorPtr[i] = Math.Pow(bases[index + i], exponent);
	}

	return new Vector<float>(vectorPtr);
	}

	// This is the unchanged algorithm from your repo.
	private static float LinearToSRGB_Scalar(float signal)
	{
	float a = 0.055f;

	if (signal <= 0.0031308)
	{
	return signal * 12.92f;
	}

	return ((float)((1 + a) * Math.Pow(signal, 1 / 2.4f))) - a;
	}
	}