Skip to content

Instantly share code, notes, and snippets.

View FrankNiemeyer's full-sized avatar

Frank Niemeyer FrankNiemeyer

View GitHub Profile
void dot3_soa_autovec(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) {
for (auto j = 0; j < reps; ++j) {
for (auto i = 0; i < vector_len; ++i) {
dp[i] = xs[i] * xs[i] + ys[i] * ys[i] + zs[i] * zs[i];
}
}
}
lea rax,[rcx+rbx]
vmovups ymm2,ymmword ptr [r10+rcx]
vmovups ymm3,ymmword ptr [rcx]
vmovups ymm0,ymmword ptr [r11+rcx]
lea rcx,[rcx+20h]
vmulps ymm1,ymm2,ymm2
vfmadd231ps ymm1,ymm3,ymm3
vfmadd231ps ymm1,ymm0,ymm0
vmovups ymmword ptr [rax+r8],ymm1
sub r9,1
// Hand-vectorized variant of the structure-of-arrays "dot3" kernel: computes
// x*x + y*y + z*z one __m256 chunk at a time using separate AVX multiply and
// add intrinsics.
//
// xs, ys, zs  component arrays (only read here)
// dp          output array, written through a __m256 pointer
//
// `reps`, `vector_len` and `lane_width` are defined outside this excerpt.
// NOTE(review): the closing braces of the outer loop and of the function are
// not part of this excerpt.
void dot3_soa_vectorized(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) {
// The whole pass is repeated `reps` times (presumably for benchmark timing - confirm).
for (auto j = 0; j < reps; ++j) {
// Reinterpret the float storage as arrays of __m256. The C-style casts also
// drop the const qualifier of the input pointers.
// NOTE(review): dereferencing a __m256* may assume 32-byte alignment; nothing
// visible here guarantees that for vector<float>::data() - confirm the allocator.
const auto px = (__m256*)xs.data();
const auto py = (__m256*)ys.data();
const auto pz = (__m256*)zs.data();
auto pd = (__m256*)dp.data();
// Number of __m256 chunks to process.
// NOTE(review): if vector_len is not a multiple of lane_width, the remaining
// tail elements appear to be left unprocessed - confirm against the caller.
auto i = vector_len / lane_width;
// Post-decrement test: the body runs with i going from (count - 1) down to 0,
// i.e. the arrays are walked back to front.
while (i--) {
// pd[i] = (px*px + py*py) + pz*pz, built from three multiplies and two adds.
pd[i] = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(px[i], px[i]), _mm256_mul_ps(py[i], py[i])), _mm256_mul_ps(pz[i], pz[i]));
}
lea rax,[rax-20h]
vmovups ymm0,ymmword ptr [rbx+rax]
vmovups ymm2,ymmword ptr [rax]
vmovups ymm3,ymmword ptr [rdi+rax]
vmulps ymm1,ymm0,ymm0
vmulps ymm0,ymm2,ymm2
vaddps ymm2,ymm1,ymm0
vmulps ymm1,ymm3,ymm3
vaddps ymm2,ymm2,ymm1
vmovups ymmword ptr [rsi+rax],ymm2
// Hand-vectorized variant of the structure-of-arrays "dot3" kernel that uses
// fused multiply-add intrinsics: one multiply for the x term, then two
// _mm256_fmadd_ps calls that accumulate the y and z terms.
//
// xs, ys, zs  component arrays (only read here)
// dp          output array, written through a __m256 pointer
//
// `reps`, `vector_len` and `lane_width` are defined outside this excerpt.
// NOTE(review): the closing braces of the outer loop and of the function are
// not part of this excerpt.
void dot3_soa_vectorized_fma(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) {
// The whole pass is repeated `reps` times (presumably for benchmark timing - confirm).
for (auto j = 0; j < reps; ++j) {
// Reinterpret the float storage as arrays of __m256. The C-style casts also
// drop the const qualifier of the input pointers.
// NOTE(review): dereferencing a __m256* may assume 32-byte alignment; nothing
// visible here guarantees that for vector<float>::data() - confirm the allocator.
const auto px = (__m256*)xs.data();
const auto py = (__m256*)ys.data();
const auto pz = (__m256*)zs.data();
auto pd = (__m256*)dp.data();
// Number of __m256 chunks to process.
// NOTE(review): if vector_len is not a multiple of lane_width, the remaining
// tail elements appear to be left unprocessed - confirm against the caller.
auto i = vector_len / lane_width;
// Post-decrement test: the body runs with i going from (count - 1) down to 0,
// i.e. the arrays are walked back to front.
while (i--) {
// _mm256_fmadd_ps(a, b, c) yields a*b + c, so this is
// pd[i] = pz*pz + (py*py + (px*px)).
pd[i] = _mm256_fmadd_ps(pz[i], pz[i], _mm256_fmadd_ps(py[i], py[i], _mm256_mul_ps(px[i], px[i])));
}
lea rax,[rax-20h]
vmovups ymm0,ymmword ptr [rbx+rax]
vmovups ymm2,ymmword ptr [rax]
vmovups ymm3,ymmword ptr [rdi+rax]
vmulps ymm0,ymm0,ymm0
vfmadd231ps ymm0,ymm2,ymm2
vfmadd231ps ymm0,ymm3,ymm3
vmovups ymmword ptr [rsi+rax],ymm0
sub r11d,1
jne dot3_soa_vectorized+52h (07FF7DF9913D2h)