The code below emulates "restrict", the function argument attribute that indicates the ranges of the pointers do not overlap. It explains its limitations:
struct Aligned {
alignas(1024) double d;
};
using u = unsigned;
constexpr u count = 1 << 10;
using ul = unsigned long;
void max(double *r, const double *left, const double *right) {
for(int i = 0; i < count; ++i ) {
r[i] = left[i] < right[i] ? right[i] : left[i];
}
}
// The following code shows it is currently possible
// for Clang to deduce overlapping does not occur
//
// However, other rules of the standard language
// prevents programmers from using a generalization
// of the idiom shown below
//
// The idiom is very brittle,
// it relies on Clang realizing that
// count <= count + long(o1)
// count <= count + long(o2)
// because o1, o2 are 32 bit numbers
// thus, it is barely able to prove
// count <= (left - r)
// and count <= (right - left)
// that is, there is no overlap.
//
// We don't need "restrict", what we need are
// ways to tell the compiler general assertions
void max(Aligned *a, u o1, u o2) {
auto r = &a->d;
auto left = r + (count + long(o1));
auto right = left + (count + long(o2));
max(r, left, right);
}
#include <assert.h>
// one annoyance with the current standard:
void call_max(double *r, double *s1, double *s2) {
// suppose the program makes it so that these
// assertions are always true here
assert(count <= (s1 - r));
assert((s1 - r) < (1 << 32));
assert(count <= (s2 - s1));
assert((s2 - s2) < (1 << 32));
// and all of them are aligned to at least 1024
auto diff1 = (s1 - r);
auto diff2 = (s2 - s1);
max(
reinterpret_cast<Aligned *>(r), //<- undefined behavior
// there is no way to say
// "I know what I am doing, you can now treat r as an Aligned *
// This is a case in which the type system forces rules
// on the programmers without recourse, not the C++ way
// that otherwise let's you hurt yourself with all sorts
// of "nuclear powered foot machine guns"
diff1,
diff2
);
}
For that code, Clang is able to leverage that the ranges do not overlap to enable vectorization, if allowed to work on AVX512, it is able to operate on 8 double
at a time see code in the Compiler Explorer.
For brevity, let us look at "call_max":
call_max(double*, double*, double*): # @call_max(double*, double*, double*)
sub rdx, rsi
sub rsi, rdi
shr rdx, 3
movabs rax, 34359738360
and rax, rsi
add rax, rdi
add rax, 8192
mov ecx, edx
xor edx, edx
.LBB2_1: # =>This Inner Loop Header: Depth=1
vmovupd zmm0, zmmword ptr [rax + 8*rcx + 8192]
vmaxpd zmm0, zmm0, zmmword ptr [rax]
vmovupd zmmword ptr [rdi + 8*rdx], zmm0
add rdx, 8
add rax, 64
cmp rdx, 1024
jne .LBB2_1
vzeroupper
ret
There is only one conditional, to loop back.