dlandahl · October 12, 2021 11:06
diff --git a/avx_sine.c b/avx_sine.c
 #if AVX2_ENABLED
 /*
  * Read eight linearly interpolated values from sin_table.
  *
  * phase: eight lookup positions; each lane is multiplied by TABLE_SIZE to
  *        form a fractional table index.
  * Returns, per lane, lower * (1 - t) + upper * t, where lower and upper are
  * the table entries at floor(index) and floor(index) + 1 and t is the
  * fractional part of the index.
  *
  * NOTE(review): the upper index equals TABLE_SIZE whenever the lower index is
  * TABLE_SIZE - 1 -- presumably sin_table carries a trailing guard entry;
  * confirm against its definition.
  */
 __m256 read_vector(__m256 phase) {
    phase = _mm256_mul_ps(phase, _mm256_set1_ps(TABLE_SIZE));

    __m256i a = _mm256_cvtps_epi32(_mm256_floor_ps(phase));
    /* Derive the upper index with integer arithmetic. Computing it as
       floor(phase + 1.0f) can yield a + 2: when phase is the largest float
       below a whole number, the float addition rounds up to the next whole
       number before the floor is taken. */
    __m256i b = _mm256_add_epi32(a, _mm256_set1_epi32(1));

    /* Fractional distance from the lower entry towards the upper entry. */
    __m256 t     = _mm256_sub_ps(phase, _mm256_cvtepi32_ps(a));
    /* Scale 4: indices are multiplied by 4 bytes to address the entries. */
    __m256 lower = _mm256_i32gather_ps(sin_table, a, 4);
    __m256 upper = _mm256_i32gather_ps(sin_table, b, 4);

    upper = _mm256_mul_ps(upper, t);
    t     = _mm256_sub_ps(_mm256_set1_ps(1), t);
    lower = _mm256_mul_ps(lower, t);
    return _mm256_add_ps(lower, upper);
 }

 /*
  * Fill data[0..count) with the sum of ten table-lookup partials.
  *
  * Partial p (1..10) advances its phase by p * 50.0 / SAMPLE_RATE per sample
  * and is scaled by 1 / p. The buffer is zeroed first; each partial is then
  * accumulated into it eight samples at a time via read_vector().
  *
  * data:  output buffer holding at least count elements. Unaligned loads and
  *        stores are used, so no particular alignment is required.
  * count: number of samples; must be a non-negative multiple of 8 (asserted).
  */
 void vector256_table_additive_synthesis(f32* data, int count) {
    /* Validate before touching memory: a negative count would convert to a
       huge size_t in the memset below. */
    assert(count >= 0);
    assert(count % 8 == 0);

    memset(data, 0, count * sizeof(f32));

    f64 mul = 50.0 / SAMPLE_RATE;

    __m256 indices = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    __m256 max     = _mm256_set1_ps(1);

    for (int p = 1; p <= 10; p++) {
        f64 phase = 0;
        f64 step_size = p * mul;

        __m256 amplitude = _mm256_set1_ps(1.0 / p);
        /* Per-lane phase offsets: lane i is i steps ahead of lane 0. */
        __m256 offset = _mm256_mul_ps(_mm256_set1_ps(step_size), indices);

        for (int n = 0; n < count; n += 8) {
            /* Unaligned load: the caller's buffer alignment is unknown here. */
            __m256 vdata  = _mm256_loadu_ps(data + n);
            __m256 vphase = _mm256_add_ps(_mm256_set1_ps(phase), offset);

            /* Subtract 1 from every lane that is not less than 1; the mask
               selects the constant 1.0 for those lanes and 0.0 elsewhere. */
            __m256 mask = _mm256_cmp_ps(vphase, max, _CMP_NLT_UQ);
            vphase      = _mm256_sub_ps(vphase, _mm256_and_ps(max, mask));

            __m256 sine = read_vector(vphase);
            sine        = _mm256_mul_ps(sine, amplitude);
            vdata       = _mm256_add_ps(sine, vdata);
            _mm256_storeu_ps(data + n, vdata);

            /* Advance the scalar phase by one vector width, wrapping once. */
            phase += step_size * 8;
            phase -= 1 * (phase > 1);
        }
    }
 }
 #endif
	#if AVX2_ENABLED
	/*
	 * Read eight linearly interpolated values from sin_table.
	 *
	 * phase: eight lookup positions; each lane is multiplied by TABLE_SIZE to
	 *        form a fractional table index.
	 * Returns, per lane, lower * (1 - t) + upper * t, where lower and upper are
	 * the table entries at floor(index) and floor(index) + 1 and t is the
	 * fractional part of the index.
	 *
	 * NOTE(review): the upper index equals TABLE_SIZE whenever the lower index
	 * is TABLE_SIZE - 1 -- presumably sin_table carries a trailing guard entry;
	 * confirm against its definition.
	 */
	__m256 read_vector(__m256 phase) {
	    phase = _mm256_mul_ps(phase, _mm256_set1_ps(TABLE_SIZE));

	    __m256i a = _mm256_cvtps_epi32(_mm256_floor_ps(phase));
	    /* Derive the upper index with integer arithmetic. Computing it as
	       floor(phase + 1.0f) can yield a + 2: when phase is the largest float
	       below a whole number, the float addition rounds up to the next whole
	       number before the floor is taken. */
	    __m256i b = _mm256_add_epi32(a, _mm256_set1_epi32(1));

	    /* Fractional distance from the lower entry towards the upper entry. */
	    __m256 t = _mm256_sub_ps(phase, _mm256_cvtepi32_ps(a));
	    /* Scale 4: indices are multiplied by 4 bytes to address the entries. */
	    __m256 lower = _mm256_i32gather_ps(sin_table, a, 4);
	    __m256 upper = _mm256_i32gather_ps(sin_table, b, 4);

	    upper = _mm256_mul_ps(upper, t);
	    t = _mm256_sub_ps(_mm256_set1_ps(1), t);
	    lower = _mm256_mul_ps(lower, t);
	    return _mm256_add_ps(lower, upper);
	}

	/*
	 * Fill data[0..count) with the sum of ten table-lookup partials.
	 *
	 * Partial p (1..10) advances its phase by p * 50.0 / SAMPLE_RATE per sample
	 * and is scaled by 1 / p. The buffer is zeroed first; each partial is then
	 * accumulated into it eight samples at a time via read_vector().
	 *
	 * data:  output buffer holding at least count elements. Unaligned loads and
	 *        stores are used, so no particular alignment is required.
	 * count: number of samples; must be a non-negative multiple of 8 (asserted).
	 */
	void vector256_table_additive_synthesis(f32* data, int count) {
	    /* Validate before touching memory: a negative count would convert to a
	       huge size_t in the memset below. */
	    assert(count >= 0);
	    assert(count % 8 == 0);

	    memset(data, 0, count * sizeof(f32));

	    f64 mul = 50.0 / SAMPLE_RATE;

	    __m256 indices = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
	    __m256 max = _mm256_set1_ps(1);

	    for (int p = 1; p <= 10; p++) {
	        f64 phase = 0;
	        f64 step_size = p * mul;

	        __m256 amplitude = _mm256_set1_ps(1.0 / p);
	        /* Per-lane phase offsets: lane i is i steps ahead of lane 0. */
	        __m256 offset = _mm256_mul_ps(_mm256_set1_ps(step_size), indices);

	        for (int n = 0; n < count; n += 8) {
	            /* Unaligned load: the caller's buffer alignment is unknown here. */
	            __m256 vdata = _mm256_loadu_ps(data + n);
	            __m256 vphase = _mm256_add_ps(_mm256_set1_ps(phase), offset);

	            /* Subtract 1 from every lane that is not less than 1; the mask
	               selects the constant 1.0 for those lanes and 0.0 elsewhere. */
	            __m256 mask = _mm256_cmp_ps(vphase, max, _CMP_NLT_UQ);
	            vphase = _mm256_sub_ps(vphase, _mm256_and_ps(max, mask));

	            __m256 sine = read_vector(vphase);
	            sine = _mm256_mul_ps(sine, amplitude);
	            vdata = _mm256_add_ps(sine, vdata);
	            _mm256_storeu_ps(data + n, vdata);

	            /* Advance the scalar phase by one vector width, wrapping once. */
	            phase += step_size * 8;
	            phase -= 1 * (phase > 1);
	        }
	    }
	}
	#endif
No results found