This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static void Dot3SoaVectorized(float[] xs, float[] ys, float[] zs, float[] dp) { | |
for (var j = 0; j < reps; ++j) { | |
for (var i = 0; i < dp.Length; i += laneWidth) { | |
var x = new Vector<float>(xs, i); | |
var y = new Vector<float>(ys, i); | |
var z = new Vector<float>(zs, i); | |
var d = x * x + y * y + z * z; | |
d.CopyTo(dp, i); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea ebp,[r11+7] | |
cmp ebp,esi | |
jae 00007FF98D06828A | |
vmovupd ymm0,ymmword ptr [rcx+r11*4+10h] | |
cmp ebp,edi | |
jae 00007FF98D06828A | |
vmovupd ymm1,ymmword ptr [rdx+r11*4+10h] | |
cmp ebp,ebx | |
jae 00007FF98D06828A | |
vmovupd ymm2,ymmword ptr [r8+r11*4+10h] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_aos_scalar(const vector<Vec3f>& vs, vector<float>& dp) { | |
for (auto j = 0; j < reps; ++j) { | |
auto i = vector_len; | |
while (i--) { | |
dp[i] = vs[i].x * vs[i].x + vs[i].y * vs[i].y + vs[i].z * vs[i].z; | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-0Ch] | |
lea rdx,[rdx-4] | |
vmovss xmm0,dword ptr [rax-8] | |
vmovss xmm2,dword ptr [rax-4] | |
vmovss xmm3,dword ptr [rax] | |
vmulss xmm1,xmm0,xmm0 | |
vmulss xmm0,xmm2,xmm2 | |
vaddss xmm2,xmm1,xmm0 | |
vmulss xmm1,xmm3,xmm3 | |
vaddss xmm2,xmm2,xmm1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_aos_vector_dp(const vector<Vec3f>& vs, vector<float>& dp) { | |
// 0000 0000 0111 0001: mul lower three components, store sum in lowest component | |
static const auto mask = 0x71; | |
for (auto j = 0; j < reps; ++j) { | |
const auto pvs = (float*)vs.data(); | |
auto pdp = (float*)dp.data(); | |
auto i = vector_len; | |
while (i--) { | |
// load 16 bytes (xyz|x) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; One iteration of a loop built around vdpps; each pass emits one float.
; Step the source pointer back by 12 bytes (0Ch) - presumably one 3-float element; confirm against the caller.
lea rcx,[rcx-0Ch] | |
; Step the destination pointer back by 4 bytes (one float).
lea rax,[rax-4] | |
; Unaligned 16-byte load from the source pointer, i.e. 4 bytes more than the 12-byte step.
vmovups xmm0,xmmword ptr [rcx] | |
; Dot product of xmm0 with itself; mask 71h multiplies the lower three lanes and puts the sum in the lowest lane.
vdpps xmm0,xmm0,xmm0,71h | |
; Store the lowest lane (the scalar result) to the destination.
vmovss dword ptr [rax],xmm0 | |
; Decrement the remaining-iteration counter and loop while it is non-zero.
sub edx,1 | |
jne benchmark<<lambda_d1ac89d5e59a169233af7a419374e043>,<lambda_c092a5680821d9f5b8bc5a7043f59100> >+0C0h (07FF7B83F2100h) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_aos_vector_gather(const vector<Vec3f>& vs, vector<float>& dp) { | |
static const auto epi32_one = _mm256_set1_epi32(1); | |
static const auto x_offsets = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21); | |
static const auto y_offsets = _mm256_add_epi32(x_offsets, epi32_one); | |
static const auto z_offsets = _mm256_add_epi32(y_offsets, epi32_one); | |
for (auto j = 0; j < reps; ++j) { | |
const auto pvs = (float*)vs.data(); | |
auto pdp = (__m256*)dp.data(); | |
auto i = vector_len / lane_width; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-60h] | |
lea r8,[r8-20h] | |
vpcmpeqb ymm2,ymm2,ymm2 | |
vmovups ymm5,ymm0 | |
vgatherdps ymm5,dword ptr [rax+ymm6*4],ymm2 | |
vpcmpeqb ymm2,ymm2,ymm2 | |
vmovups ymm4,ymm0 | |
vgatherdps ymm4,dword ptr [rax+ymm7*4],ymm2 | |
vpcmpeqb ymm2,ymm2,ymm2 | |
vmovups ymm1,ymm0 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_soa_scalar(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) { | |
for (auto j = 0; j < reps; ++j) { | |
auto i = vector_len; | |
while(i--) { | |
dp[i] = xs[i] * xs[i] + ys[i] * ys[i] + zs[i] * zs[i]; | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-4] | |
vmovss xmm0,dword ptr [rax] | |
vmovss xmm2,dword ptr [rdx+rax] | |
vmovss xmm3,dword ptr [r8+rax] | |
vmulss xmm1,xmm0,xmm0 | |
vmulss xmm0,xmm2,xmm2 | |
vaddss xmm2,xmm1,xmm0 | |
vmulss xmm1,xmm3,xmm3 | |
vaddss xmm2,xmm2,xmm1 | |
vmovss dword ptr [r9+rax],xmm2 |