rygorous · February 4, 2021 18:56 · stephenatwork · Feb 5, 2015 · AndrewPardoe · Aug 2, 2016
diff --git a/gistfile1.txt b/gistfile1.txt
 // Conjugate split-radix FFT inner loop

 // Both of these compiled with VC++ 2012, 32-bit, "/O2 /fp:fast".
 // NOTE: I also tried clang-cl and it seems to be primarily a VC++
 // problem. (Which doesn't help me much.)
 //
 // NOTE 2: argh, did the Clang test wrong. It's still primarily a
 // VC++ problem, but Clang has a notable slowdown too. Anyway, new
 // results generated automatically from a simpler standalone test
 // where I can toggle between versions using a single commandline
 // switch to prevent further mistakes.

 // Compiler flags used:
 // VC++ = VC++ 2012 /fp:fast /O2 /D_HAS_EXCEPTIONS=0 (exceptions disabled to keep the comparison fair, since clang-cl currently doesn't support them)
 // Clang = clang-cl 3.5.0 /O3 /D_HAS_EXCEPTIONS=0
 //
 // This is computing FFTs on an input vector that is just 512
 // 1s. Boring but an easy test.
 //
 // Code that VC++ 2012 outputs is here: https://gist.github.com/rygorous/a603c36d5b5288c96fb1

 // ----- Variant 1: this uses
 //
 //   #include <complex>
 //   typedef std::complex<float> complexf;

    // Variant 1 inner loop: one butterfly per index k in [0, N1), written with
    // complexf operators (complexf is std::complex<float> per the typedef noted
    // above). Reads out0..out3[k] and twiddle[k]; overwrites out0..out3[k].
    for (size_t k = 0; k < N1; k++)
    {
        // Copy the two "U" inputs and the twiddle factor into locals before
        // any of the outputs are overwritten.
        complexf Uk    = out0[k];
        complexf Uk_N1 = out1[k];
        complexf w     = twiddle[k];

        // Twiddle Zk, Z'k then butterfly
        // Zk uses w, Z'k uses the complex conjugate of w.
        complexf Zk    = w * out2[k];
        complexf Zpk   = std::conj(w) * out3[k];

        // Sum, and difference multiplied by -i (expressed here as a full
        // complex multiply by the constant (0, -1)).
        complexf Zsum = Zk + Zpk;
        complexf Zdif = complexf(0.0f, -1.0f) * (Zk - Zpk);

        // Write back all four outputs in place.
        out0[k] = Uk    + Zsum;
        out1[k] = Uk_N1 + Zdif;
        out2[k] = Uk    - Zsum;
        out3[k] = Uk_N1 - Zdif;
    }
    
 // results for FFT: (stats over 1 million runs)

 ---- VC++ std::complex
 Complex N=512, shortest=33477 cycles, avg 34521.76
 Real    N=512, shortest=18073 cycles, avg 18424.41
 ---- Clang std::complex
 Complex N=512, shortest=19988 cycles, avg 20576.95
 Real    N=512, shortest=11027 cycles, avg 11682.85

 // ----- Variant 2: this one just has a struct.
 //
 //   struct complexf { float re, im; };

    // Variant 2 inner loop: the same butterfly as Variant 1, with every complex
    // operation expanded by hand into float arithmetic on the .re/.im members.
    // Reads out0..out3[k] and twiddle[k]; overwrites out0..out3[k].
    for (size_t k = 0; k < N1; k++)
    {
        // These are references, not copies: in2/in3 alias out2[k]/out3[k],
        // which are overwritten further down. All reads through them happen
        // in the four products below, before any store.
        complexf const &w = twiddle[k];
        complexf const &in2 = out2[k];
        complexf const &in3 = out3[k];

        // (Zkr, Zki)   = w * in2        -- ordinary complex multiply
        // (Zpkr, Zpki) = conj(w) * in3  -- same with the sign of w.im flipped
        float Zkr  = w.re*in2.re - w.im*in2.im;  
        float Zki  = w.re*in2.im + w.im*in2.re;
        float Zpkr = w.re*in3.re + w.im*in3.im;
        float Zpki = w.re*in3.im - w.im*in3.re;
        
        // Zsum = Zk + Zpk.
        // Zdif = -i * (Zk - Zpk): multiplying (a + bi) by -i gives (b - ai),
        // so the real part is the imaginary difference and the imaginary
        // part is the negated real difference.
        float Zsumr = Zkr + Zpkr;
        float Zsumi = Zki + Zpki;
        float Zdifr = Zki - Zpki;
        float Zdifi = Zpkr - Zkr;

        // Order matters: out2[k]/out3[k] are computed from out0[k]/out1[k]
        // first, and only then are out0[k]/out1[k] updated in place with +=.
        out2[k].re = out0[k].re - Zsumr;
        out2[k].im = out0[k].im - Zsumi;
        out0[k].re += Zsumr;
        out0[k].im += Zsumi;
        out3[k].re = out1[k].re - Zdifr;
        out3[k].im = out1[k].im - Zdifi;
        out1[k].re += Zdifr;
        out1[k].im += Zdifi;
    }

 // results for FFT: (stats over 1 million runs)

 ---- VC++ complexf
 Complex N=512, shortest=12999 cycles, avg 13779.46
 Real    N=512, shortest=8528 cycles, avg 9113.15
 ---- Clang complexf
 Complex N=512, shortest=12101 cycles, avg 12782.80
 Real    N=512, shortest=7234 cycles, avg 7652.84
	// Conjugate split-radix FFT inner loop

	// Both of these compiled with VC++ 2012, 32-bit, "/O2 /fp:fast".
	// NOTE: I also tried clang-cl and it seems to be primarily a VC++
	// problem. (Which doesn't help me much.)
	//
	// NOTE 2: argh, did the Clang test wrong. It's still primarily a
	// VC++ problem, but Clang has a notable slowdown too. Anyway, new
	// results generated automatically from a simpler standalone test
	// where I can toggle between versions using a single commandline
	// switch to prevent further mistakes.

	// Compiler flags used:
	// VC++ = VC++ 2012 /fp:fast /O2 /D_HAS_EXCEPTIONS=0 (exceptions disabled to keep the comparison fair, since clang-cl currently doesn't support them)
	// Clang = clang-cl 3.5.0 /O3 /D_HAS_EXCEPTIONS=0
	//
	// This is computing FFTs on an input vector that is just 512
	// 1s. Boring but an easy test.
	//
	// Code that VC++ 2012 outputs is here: https://gist.github.com/rygorous/a603c36d5b5288c96fb1

	// ----- Variant 1: this uses
	//
	// #include <complex>
	// typedef std::complex<float> complexf;

	// Variant 1 inner loop: one butterfly per index k in [0, N1), written with
	// complexf operators (complexf is std::complex<float> per the typedef noted
	// above). Reads out0..out3[k] and twiddle[k]; overwrites out0..out3[k].
	for (size_t k = 0; k < N1; k++)
	{
	// Copy the two "U" inputs and the twiddle factor into locals before
	// any of the outputs are overwritten.
	complexf Uk = out0[k];
	complexf Uk_N1 = out1[k];
	complexf w = twiddle[k];

	// Twiddle Zk, Z'k then butterfly
	// Zk uses w, Z'k uses the complex conjugate of w.
	complexf Zk = w * out2[k];
	complexf Zpk = std::conj(w) * out3[k];

	// Sum, and difference multiplied by -i (expressed here as a full
	// complex multiply by the constant (0, -1)).
	complexf Zsum = Zk + Zpk;
	complexf Zdif = complexf(0.0f, -1.0f) * (Zk - Zpk);

	// Write back all four outputs in place.
	out0[k] = Uk + Zsum;
	out1[k] = Uk_N1 + Zdif;
	out2[k] = Uk - Zsum;
	out3[k] = Uk_N1 - Zdif;
	}

	// results for FFT: (stats over 1 million runs)

	---- VC++ std::complex
	Complex N=512, shortest=33477 cycles, avg 34521.76
	Real N=512, shortest=18073 cycles, avg 18424.41
	---- Clang std::complex
	Complex N=512, shortest=19988 cycles, avg 20576.95
	Real N=512, shortest=11027 cycles, avg 11682.85

	// ----- Variant 2: this one just has a struct.
	//
	// struct complexf { float re, im; };

	// Variant 2 inner loop: one butterfly per index k in [0, N1), with every
	// complex operation expanded by hand into float arithmetic on the .re/.im
	// members of the plain complexf struct.
	// Reads out0..out3[k] and twiddle[k]; overwrites out0..out3[k].
	for (size_t k = 0; k < N1; k++)
	{
		// These are references, not copies: in2/in3 alias out2[k]/out3[k],
		// which are overwritten further down. All reads through them happen
		// in the four products below, before any store.
		complexf const &w = twiddle[k];
		complexf const &in2 = out2[k];
		complexf const &in3 = out3[k];

		// (Zkr, Zki)   = w * in2        -- ordinary complex multiply
		// (Zpkr, Zpki) = conj(w) * in3  -- same with the sign of w.im flipped
		float Zkr  = w.re*in2.re - w.im*in2.im;
		float Zki  = w.re*in2.im + w.im*in2.re;
		float Zpkr = w.re*in3.re + w.im*in3.im;
		float Zpki = w.re*in3.im - w.im*in3.re;

		// Zsum = Zk + Zpk.
		// Zdif = -i * (Zk - Zpk): multiplying (a + bi) by -i gives (b - ai),
		// so the real part is the imaginary difference and the imaginary
		// part is the negated real difference.
		float Zsumr = Zkr + Zpkr;
		float Zsumi = Zki + Zpki;
		float Zdifr = Zki - Zpki;
		float Zdifi = Zpkr - Zkr;

		// Order matters: out2[k]/out3[k] are computed from out0[k]/out1[k]
		// first, and only then are out0[k]/out1[k] updated in place with +=.
		out2[k].re = out0[k].re - Zsumr;
		out2[k].im = out0[k].im - Zsumi;
		out0[k].re += Zsumr;
		out0[k].im += Zsumi;
		out3[k].re = out1[k].re - Zdifr;
		out3[k].im = out1[k].im - Zdifi;
		out1[k].re += Zdifr;
		out1[k].im += Zdifi;
	}

	// results for FFT: (stats over 1 million runs)

	---- VC++ complexf
	Complex N=512, shortest=12999 cycles, avg 13779.46
	Real N=512, shortest=8528 cycles, avg 9113.15
	---- Clang complexf
	Complex N=512, shortest=12101 cycles, avg 12782.80
	Real N=512, shortest=7234 cycles, avg 7652.84