This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-20h] | |
vmovups ymm0,ymmword ptr [rbx+rax] | |
vmovups ymm2,ymmword ptr [rax] | |
vmovups ymm3,ymmword ptr [rdi+rax] | |
vmulps ymm0,ymm0,ymm0 | |
vfmadd231ps ymm0,ymm2,ymm2 | |
vfmadd231ps ymm0,ymm3,ymm3 | |
vmovups ymmword ptr [rsi+rax],ymm0 | |
sub r11d,1 | |
jne dot3_soa_vectorized+52h (07FF7DF9913D2h) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_soa_vectorized_fma(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) { | |
for (auto j = 0; j < reps; ++j) { | |
const auto px = (__m256*)xs.data(); | |
const auto py = (__m256*)ys.data(); | |
const auto pz = (__m256*)zs.data(); | |
auto pd = (__m256*)dp.data(); | |
auto i = vector_len / lane_width; | |
while (i--) { | |
pd[i] = _mm256_fmadd_ps(pz[i], pz[i], _mm256_fmadd_ps(py[i], py[i], _mm256_mul_ps(px[i], px[i]))); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-20h] | |
vmovups ymm0,ymmword ptr [rbx+rax] | |
vmovups ymm2,ymmword ptr [rax] | |
vmovups ymm3,ymmword ptr [rdi+rax] | |
vmulps ymm1,ymm0,ymm0 | |
vmulps ymm0,ymm2,ymm2 | |
vaddps ymm2,ymm1,ymm0 | |
vmulps ymm1,ymm3,ymm3 | |
vaddps ymm2,ymm2,ymm1 | |
vmovups ymmword ptr [rsi+rax],ymm2 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_soa_vectorized(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) { | |
for (auto j = 0; j < reps; ++j) { | |
const auto px = (__m256*)xs.data(); | |
const auto py = (__m256*)ys.data(); | |
const auto pz = (__m256*)zs.data(); | |
auto pd = (__m256*)dp.data(); | |
auto i = vector_len / lane_width; | |
while (i--) { | |
pd[i] = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(px[i], px[i]), _mm256_mul_ps(py[i], py[i])), _mm256_mul_ps(pz[i], pz[i])); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rcx+rbx] | |
vmovups ymm2,ymmword ptr [r10+rcx] | |
vmovups ymm3,ymmword ptr [rcx] | |
vmovups ymm0,ymmword ptr [r11+rcx] | |
lea rcx,[rcx+20h] | |
vmulps ymm1,ymm2,ymm2 | |
vfmadd231ps ymm1,ymm3,ymm3 | |
vfmadd231ps ymm1,ymm0,ymm0 | |
vmovups ymmword ptr [rax+r8],ymm1 | |
sub r9,1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_soa_autovec(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) { | |
for (auto j = 0; j < reps; ++j) { | |
for (auto i = 0; i < vector_len; ++i) { | |
dp[i] = xs[i] * xs[i] + ys[i] * ys[i] + zs[i] * zs[i]; | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-4] | |
vmovss xmm0,dword ptr [rax] | |
vmovss xmm2,dword ptr [rdx+rax] | |
vmovss xmm3,dword ptr [r8+rax] | |
vmulss xmm1,xmm0,xmm0 | |
vmulss xmm0,xmm2,xmm2 | |
vaddss xmm2,xmm1,xmm0 | |
vmulss xmm1,xmm3,xmm3 | |
vaddss xmm2,xmm2,xmm1 | |
vmovss dword ptr [r9+rax],xmm2 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_soa_scalar(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) { | |
for (auto j = 0; j < reps; ++j) { | |
auto i = vector_len; | |
while(i--) { | |
dp[i] = xs[i] * xs[i] + ys[i] * ys[i] + zs[i] * zs[i]; | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-60h] | |
lea r8,[r8-20h] | |
vpcmpeqb ymm2,ymm2,ymm2 | |
vmovups ymm5,ymm0 | |
vgatherdps ymm5,dword ptr [rax+ymm6*4],ymm2 | |
vpcmpeqb ymm2,ymm2,ymm2 | |
vmovups ymm4,ymm0 | |
vgatherdps ymm4,dword ptr [rax+ymm7*4],ymm2 | |
vpcmpeqb ymm2,ymm2,ymm2 | |
vmovups ymm1,ymm0 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_aos_vector_gather(const vector<Vec3f>& vs, vector<float>& dp) { | |
static const auto epi32_one = _mm256_set1_epi32(1); | |
static const auto x_offsets = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21); | |
static const auto y_offsets = _mm256_add_epi32(x_offsets, epi32_one); | |
static const auto z_offsets = _mm256_add_epi32(y_offsets, epi32_one); | |
for (auto j = 0; j < reps; ++j) { | |
const auto pvs = (float*)vs.data(); | |
auto pdp = (__m256*)dp.data(); | |
auto i = vector_len / lane_width; |
NewerOlder