wareya · March 15, 2017 21:34
diff --git a/demo.cpp b/demo.cpp
 #include "fft.hpp"

 #include <stdint.h>
 #include <stdio.h>
 #include <math.h> // cos
 #include <vector> // lazy
 #include <String.h> // strerror

 // "Hann" window. This is essentially just the shape of a cosine
 // from 0.0 to 1.0 to 0.0. The difference from a cosine window is
 // that the Hann window is completely interpolant (jargon meaning
 // that we don't have to normalize it when we overlap copies of it)
 // because it's horizontally flat at the start, middle, and end. A
 // real cosine window contains 50% the length of a single cycle of
 // the cosine, not 100%.
 /*   0         1
 1.0|   ....   |
    |  .    .  |
 0.0|..      ..|
    |          |
 -1.0|          |
 */
 inline double window(double pace)
 {
    return -cos(pace*M_PI*2)/2+0.5;
 }
 #define likely(x)    __builtin_expect (!!(x), 1)
 #define unlikely(x)  __builtin_expect (!!(x), 0)
 inline double get_sample(int16_t* v, size_t size, int64_t i)
 {
    if(unlikely(i < 0 or i >= size))
        return 0.0;
    else
        return double(v[i])/double(32768);
 }
 inline void set_sample(double* v, size_t size, int64_t i, double n)
 {
    if(likely(i >= 0 and i < size))
        v[i] = n;
 }
 int main()
 {
    // expects monaural signed 16-bit audio
    auto input = fopen("demo.pcm", "rb");
    auto output = fopen("output.pcm", "wb");
    
    if(!input or !output) return puts("Error opening a file"), 0;
    
    /*
    Doing an FFT on n samples requires n * log(n) operations.
    Therefore, extracting a single sample from an FFT is overkill,
     because convolution only requires n operations.
    But because FFTs are "cyclical" (that is, they act like the
     audio sample they're working on loops at the start and end)
     we can't just do an FFT on every block of audio, because it
     would cause leaking between the start and end of each block,
     which basically means clicking/distortion.
    To reduce this, FFTs are "windowed" (lapped; tapered; pick
     your own name). This doesn't give 100% perfect quality, but
     it allows you to perform 2*n*log(n) operations per n samples
     to do convolution operations, instead of n^2 operations, and
     if you use a good enough window and a large enough FFT, the
     distortion/clicking becomes unnoticable.
    */
    
    fseek(input, 0, SEEK_END);
    auto filesize = ftell(input);
    fseek(input, 0, SEEK_SET);
    
    auto count_input_samples = filesize/sizeof(int16_t); // truncates intentionally
    int16_t * input_samples = (int16_t*)malloc(count_input_samples*sizeof(int16_t));
    
    fread(input_samples, sizeof(int16_t), count_input_samples, input);
    
    // stores ouput waveform in floating point format
    auto count_output_samples = count_input_samples;
    double * output_samples = (double*)malloc(count_output_samples*sizeof(double));
    
    const int quality = 2048;
    const int span = quality*2; // size of the fft and its input data
    
    // temporary buffers
    char * heap = (char*)malloc(span*sizeof(double)*5);
    // put everything in a single contiguous heap to try to help the CPU cache
    double * windowed_samples = (double*)(heap+(span*sizeof(double)*0));
    double * real_bins        = (double*)(heap+(span*sizeof(double)*1));
    double * imag_bins        = (double*)(heap+(span*sizeof(double)*2));
    double * real_samples     = (double*)(heap+(span*sizeof(double)*3));
    double * imag_samples     = (double*)(heap+(span*sizeof(double)*4)); // don't ask...
    /*
    double * windowed_samples = malloc(span*sizeof(double));
    double * real_bins        = malloc(span*sizeof(double));
    double * imag_bins        = malloc(span*sizeof(double));
    double * real_samples     = malloc(span*sizeof(double));
    double * imag_samples     = malloc(span*sizeof(double)); // don't ask...
    */
    for(auto i = 0; i < count_input_samples; i += quality)
    {
        // copy input stream data into windowed buffer
        for(auto j = 0; j < span; j++) // starts at -quality from current sample "i"
        {
            double sample = get_sample(input_samples, count_input_samples, i+j-quality);
            windowed_samples[j] = sample * window(double(j)/(span));
        }
        
        // turn our spatial (waveform) data into frequency data
        fft(windowed_samples, nullptr, span, real_bins, imag_bins);
        
        // set all bins with a frequency above 50% of the nyquist frequency
        // (i.e. above 25% of the sample rate) to 0
        for(auto j = 0; j < span; j++)
        {
            // because the nyquist frequency is at bin span/2, with
            // clockwise frequencies being before it and counter-clockwise
            // frequencies being after it.
            // it's stupid, I know, but that's how fourier transforms
            // interpret the result of turning a complex (stereo) audio stream
            // to the frequency domain, and part of how the FFT I implemented
            // works internally.
            if(j > span/4 and j < span*3/4)
            {
                real_bins[j] = 0;
                imag_bins[j] = 0;
            }
        }
        
        // turn our filtered frequency data into filtered spatial (waveform) data
        ifft(real_bins, imag_bins, span, real_samples, imag_samples);
        
        // Add our filtered samples to the output stream. Note that I chose the
        // Hann window specifically because it makes this process absolutely
        // trivial, and if you use a different window, you have to actually
        // ensure that the "lapping" between windowed chunks has a smooth shape.
        for(auto j = 0; j < span; j++)
        {
            //set_sample(output_samples, i+j-quality, real_samples[j]);
            int64_t pos = i+j-quality;
            if(pos >= 0 and pos < count_output_samples)
                output_samples[pos] += real_samples[j];
        }
    }
    
    for(auto i = 0; i < count_output_samples; i++)
    {
        // convert range clamp
        double n = output_samples[i]*double(32768);
        if(n >  32767) n =  32767;
        if(n < -32768) n = -32768;
        
        int16_t sample = round(n);
        
        fwrite(&sample, sizeof(int16_t), 1, output);
    }
 }
	#include "fft.hpp"

	#include <stdint.h>
	#include <stdio.h>
	#include <math.h> // cos
	#include <vector> // lazy
	#include <String.h> // strerror

	// "Hann" window. This is essentially just the shape of a cosine
	// from 0.0 to 1.0 to 0.0. The difference from a cosine window is
	// that the Hann window is completely interpolant (jargon meaning
	// that we don't have to normalize it when we overlap copies of it)
	// because it's horizontally flat at the start, middle, and end. A
	// real cosine window contains 50% the length of a single cycle of
	// the cosine, not 100%.
	/* 0 1
	1.0\| .... \|
	\| . . \|
	0.0\|.. ..\|
	\| \|
	-1.0\| \|
	*/
	inline double window(double pace)
	{
	return -cos(paceM_PI2)/2+0.5;
	}
	#define likely(x) __builtin_expect (!!(x), 1)
	#define unlikely(x) __builtin_expect (!!(x), 0)
	inline double get_sample(int16_t* v, size_t size, int64_t i)
	{
	if(unlikely(i < 0 or i >= size))
	return 0.0;
	else
	return double(v[i])/double(32768);
	}
	inline void set_sample(double* v, size_t size, int64_t i, double n)
	{
	if(likely(i >= 0 and i < size))
	v[i] = n;
	}
	int main()
	{
	// expects monaural signed 16-bit audio
	auto input = fopen("demo.pcm", "rb");
	auto output = fopen("output.pcm", "wb");

	if(!input or !output) return puts("Error opening a file"), 0;

	/*
	Doing an FFT on n samples requires n * log(n) operations.
	Therefore, extracting a single sample from an FFT is overkill,
	because convolution only requires n operations.
	But because FFTs are "cyclical" (that is, they act like the
	audio sample they're working on loops at the start and end)
	we can't just do an FFT on every block of audio, because it
	would cause leaking between the start and end of each block,
	which basically means clicking/distortion.
	To reduce this, FFTs are "windowed" (lapped; tapered; pick
	your own name). This doesn't give 100% perfect quality, but
	it allows you to perform 2nlog(n) operations per n samples
	to do convolution operations, instead of n^2 operations, and
	if you use a good enough window and a large enough FFT, the
	distortion/clicking becomes unnoticable.
	*/

	fseek(input, 0, SEEK_END);
	auto filesize = ftell(input);
	fseek(input, 0, SEEK_SET);

	auto count_input_samples = filesize/sizeof(int16_t); // truncates intentionally
	int16_t * input_samples = (int16_t)malloc(count_input_samplessizeof(int16_t));

	fread(input_samples, sizeof(int16_t), count_input_samples, input);

	// stores ouput waveform in floating point format
	auto count_output_samples = count_input_samples;
	double * output_samples = (double)malloc(count_output_samplessizeof(double));

	const int quality = 2048;
	const int span = quality*2; // size of the fft and its input data

	// temporary buffers
	char * heap = (char)malloc(spansizeof(double)*5);
	// put everything in a single contiguous heap to try to help the CPU cache
	double * windowed_samples = (double)(heap+(spansizeof(double)*0));
	double * real_bins = (double)(heap+(spansizeof(double)*1));
	double * imag_bins = (double)(heap+(spansizeof(double)*2));
	double * real_samples = (double)(heap+(spansizeof(double)*3));
	double * imag_samples = (double)(heap+(spansizeof(double)*4)); // don't ask...
	/*
	double * windowed_samples = malloc(span*sizeof(double));
	double * real_bins = malloc(span*sizeof(double));
	double * imag_bins = malloc(span*sizeof(double));
	double * real_samples = malloc(span*sizeof(double));
	double * imag_samples = malloc(span*sizeof(double)); // don't ask...
	*/
	for(auto i = 0; i < count_input_samples; i += quality)
	{
	// copy input stream data into windowed buffer
	for(auto j = 0; j < span; j++) // starts at -quality from current sample "i"
	{
	double sample = get_sample(input_samples, count_input_samples, i+j-quality);
	windowed_samples[j] = sample * window(double(j)/(span));
	}

	// turn our spatial (waveform) data into frequency data
	fft(windowed_samples, nullptr, span, real_bins, imag_bins);

	// set all bins with a frequency above 50% of the nyquist frequency
	// (i.e. above 25% of the sample rate) to 0
	for(auto j = 0; j < span; j++)
	{
	// because the nyquist frequency is at bin span/2, with
	// clockwise frequencies being before it and counter-clockwise
	// frequencies being after it.
	// it's stupid, I know, but that's how fourier transforms
	// interpret the result of turning a complex (stereo) audio stream
	// to the frequency domain, and part of how the FFT I implemented
	// works internally.
	if(j > span/4 and j < span*3/4)
	{
	real_bins[j] = 0;
	imag_bins[j] = 0;
	}
	}

	// turn our filtered frequency data into filtered spatial (waveform) data
	ifft(real_bins, imag_bins, span, real_samples, imag_samples);

	// Add our filtered samples to the output stream. Note that I chose the
	// Hann window specifically because it makes this process absolutely
	// trivial, and if you use a different window, you have to actually
	// ensure that the "lapping" between windowed chunks has a smooth shape.
	for(auto j = 0; j < span; j++)
	{
	//set_sample(output_samples, i+j-quality, real_samples[j]);
	int64_t pos = i+j-quality;
	if(pos >= 0 and pos < count_output_samples)
	output_samples[pos] += real_samples[j];
	}
	}

	for(auto i = 0; i < count_output_samples; i++)
	{
	// convert range clamp
	double n = output_samples[i]*double(32768);
	if(n > 32767) n = 32767;
	if(n < -32768) n = -32768;

	int16_t sample = round(n);

	fwrite(&sample, sizeof(int16_t), 1, output);
	}
	}