This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_soa_autovec(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) { | |
for (auto j = 0; j < reps; ++j) { | |
for (auto i = 0; i < vector_len; ++i) { | |
dp[i] = xs[i] * xs[i] + ys[i] * ys[i] + zs[i] * zs[i]; | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rcx+rbx] | |
vmovups ymm2,ymmword ptr [r10+rcx] | |
vmovups ymm3,ymmword ptr [rcx] | |
vmovups ymm0,ymmword ptr [r11+rcx] | |
lea rcx,[rcx+20h] | |
vmulps ymm1,ymm2,ymm2 | |
vfmadd231ps ymm1,ymm3,ymm3 | |
vfmadd231ps ymm1,ymm0,ymm0 | |
vmovups ymmword ptr [rax+r8],ymm1 | |
sub r9,1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Explicit AVX variant of the x*x + y*y + z*z kernel: views each vector's
// storage as an array of __m256 and produces one __m256 of results per inner
// iteration, using separate multiply and add intrinsics.
//
// xs, ys, zs : input component arrays.
// dp         : output array, overwritten through the pd pointer.
//
// `reps`, `vector_len` and `lane_width` are not declared in this function.
// NOTE(review): this listing stops after the inner loop's closing brace; the
// closing braces of the outer loop and of the function are not visible here.
void dot3_soa_vectorized(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) { | |
// Repeat the whole pass `reps` times; the pointers are re-derived each time.
for (auto j = 0; j < reps; ++j) { | |
// C-style casts reinterpret the float storage as __m256 and, for the three
// inputs, also cast away the const of the pointer returned by data().
// NOTE(review): indexing a __m256* assumes 32-byte-aligned storage, which a
// default-allocated std::vector<float> is not guaranteed to have - confirm
// the allocation, or consider _mm256_loadu_ps / _mm256_storeu_ps instead.
const auto px = (__m256*)xs.data(); | |
const auto py = (__m256*)ys.data(); | |
const auto pz = (__m256*)zs.data(); | |
auto pd = (__m256*)dp.data(); | |
// Number of whole __m256 groups to process.
// NOTE(review): if vector_len is not a multiple of lane_width, the remaining
// tail elements are never written - confirm callers guarantee a multiple.
auto i = vector_len / lane_width; | |
// Counts down: the body sees i = count-1, ..., 1, 0.
while (i--) { | |
// (x*x + y*y) + z*z, each product and sum as a separate intrinsic.
pd[i] = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(px[i], px[i]), _mm256_mul_ps(py[i], py[i])), _mm256_mul_ps(pz[i], pz[i])); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-20h] | |
vmovups ymm0,ymmword ptr [rbx+rax] | |
vmovups ymm2,ymmword ptr [rax] | |
vmovups ymm3,ymmword ptr [rdi+rax] | |
vmulps ymm1,ymm0,ymm0 | |
vmulps ymm0,ymm2,ymm2 | |
vaddps ymm2,ymm1,ymm0 | |
vmulps ymm1,ymm3,ymm3 | |
vaddps ymm2,ymm2,ymm1 | |
vmovups ymmword ptr [rsi+rax],ymm2 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Explicit AVX variant of the x*x + y*y + z*z kernel that uses fused
// multiply-add intrinsics: one multiply followed by two nested
// _mm256_fmadd_ps calls per __m256 of output.
//
// xs, ys, zs : input component arrays.
// dp         : output array, overwritten through the pd pointer.
//
// `reps`, `vector_len` and `lane_width` are not declared in this function.
// NOTE(review): this listing stops after the inner loop's closing brace; the
// closing braces of the outer loop and of the function are not visible here.
void dot3_soa_vectorized_fma(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) { | |
// Repeat the whole pass `reps` times; the pointers are re-derived each time.
for (auto j = 0; j < reps; ++j) { | |
// C-style casts reinterpret the float storage as __m256 and, for the three
// inputs, also cast away the const of the pointer returned by data().
// NOTE(review): indexing a __m256* assumes 32-byte-aligned storage, which a
// default-allocated std::vector<float> is not guaranteed to have - confirm
// the allocation, or consider _mm256_loadu_ps / _mm256_storeu_ps instead.
const auto px = (__m256*)xs.data(); | |
const auto py = (__m256*)ys.data(); | |
const auto pz = (__m256*)zs.data(); | |
auto pd = (__m256*)dp.data(); | |
// Number of whole __m256 groups to process.
// NOTE(review): if vector_len is not a multiple of lane_width, the remaining
// tail elements are never written - confirm callers guarantee a multiple.
auto i = vector_len / lane_width; | |
// Counts down: the body sees i = count-1, ..., 1, 0.
while (i--) { | |
// _mm256_fmadd_ps(a, b, c) yields a*b + c, so this is z*z + (y*y + (x*x)).
// Each fmadd rounds once, so low-order bits may differ from a mul-then-add
// formulation of the same expression.
pd[i] = _mm256_fmadd_ps(pz[i], pz[i], _mm256_fmadd_ps(py[i], py[i], _mm256_mul_ps(px[i], px[i]))); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-20h] | |
vmovups ymm0,ymmword ptr [rbx+rax] | |
vmovups ymm2,ymmword ptr [rax] | |
vmovups ymm3,ymmword ptr [rdi+rax] | |
vmulps ymm0,ymm0,ymm0 | |
vfmadd231ps ymm0,ymm2,ymm2 | |
vfmadd231ps ymm0,ymm3,ymm3 | |
vmovups ymmword ptr [rsi+rax],ymm0 | |
sub r11d,1 | |
jne dot3_soa_vectorized+52h (07FF7DF9913D2h) |
OlderNewer