Skip to content

Instantly share code, notes, and snippets.

@CAFxX
Last active March 6, 2025 08:58
Show Gist options
  • Save CAFxX/9aa833cc4ee9e27107f3272158e236ab to your computer and use it in GitHub Desktop.
Save CAFxX/9aa833cc4ee9e27107f3272158e236ab to your computer and use it in GitHub Desktop.
Sorting network for short arrays, branchless (CMOVxx, VMINSS/VMAXSS) - https://godbolt.org/z/v4G4xPofc
template <typename T>
static void sort(T* a, int l) {
#define S(i, j) { \
T t1 = a[i], t2 = a[j]; \
if (t1 > t2) { T t = t1; t1 = t2; t2 = t; } \
a[i] = t1, a[j] = t2;\
}
// Sorting networks from https://bertdobbelaere.github.io/sorting_networks.html
// Using the ones with lower CEs because every S(...) requires two CMOVxx or a
// VMINSS+VMAXSS pair, and it seems that's the limit per cycle on current
// processors (e.g. Zen5).
switch (l) {
case 0: break;
case 1: break;
case 2: S(0,1) break;
case 3: S(0,2) S(0,1) S(1,2) break;
case 4: S(0,2) S(1,3) S(0,1) S(2,3) S(1,2) break;
case 5: S(0,3) S(1,4) S(0,2) S(1,3) S(0,1) S(2,4) S(1,2) S(3,4) S(2,3) break;
case 6: S(0,5) S(1,3) S(2,4) S(1,2) S(3,4) S(0,3) S(2,5) S(0,1) S(2,3) S(4,5)
S(1,2) S(3,4) break;
case 7: S(0,6) S(2,3) S(4,5) S(0,2) S(1,4) S(3,6) S(0,1) S(2,5) S(3,4) S(1,2)
S(4,6) S(2,3) S(4,5) S(1,2) S(3,4) S(5,6) break;
case 8: S(0,2) S(1,3) S(4,6) S(5,7) S(0,4) S(1,5) S(2,6) S(3,7) S(0,1) S(2,3)
S(4,5) S(6,7) S(2,4) S(3,5) S(1,4) S(3,6) S(1,2) S(3,4) S(5,6) break;
case 9: S(0,3) S(1,7) S(2,5) S(4,8) S(0,7) S(2,4) S(3,8) S(5,6) S(0,2) S(1,3)
S(4,5) S(7,8) S(1,4) S(3,6) S(5,7) S(0,1) S(2,4) S(3,5) S(6,8) S(2,3)
S(4,5) S(6,7) S(1,2) S(3,4) S(5,6) break;
case 10: S(0,8) S(1,9) S(2,7) S(3,5) S(4,6) S(0,2) S(1,4) S(5,8) S(7,9) S(0,3)
S(0,1) S(2,9) S(3,7) S(4,5) S(8,9) S(2,4) S(5,7) S(6,9) S(0,1) S(3,6)
S(8,9) S(1,5) S(2,3) S(4,8) S(6,7) S(1,2) S(3,5) S(4,6) S(7,8) S(2,3)
S(4,5) S(6,7) S(3,4) S(5,6) break;
case 11: S(0,9) S(1,6) S(2,4) S(3,7) S(5,8) S(0,1) S(3,5) S(4,10) S(6,9) S(7,8)
S(1,3) S(2,5) S(4,7) S(8,10) S(0,4) S(1,2) S(3,7) S(5,9) S(6,8) S(0,1)
S(2,6) S(4,5) S(7,8) S(9,10) S(2,4) S(3,6) S(5,7) S(8,9) S(1,2) S(3,4)
S(5,6) S(7,8) S(2,3) S(4,5) S(6,7) break;
case 12: S(0,8) S(1,7) S(2,6) S(3,11) S(4,10) S(5,9) S(0,1) S(2,5) S(3,4) S(6,9)
S(7,8) S(10,11) S(0,2) S(1,6) S(5,10) S(9,11) S(0,3) S(1,2) S(4,6) S(5,7)
S(8,11) S(9,10) S(1,4) S(3,5) S(6,8) S(7,10) S(1,3) S(2,5) S(6,9) S(8,10)
S(2,3) S(4,5) S(6,7) S(8,9) S(4,6) S(5,7) S(3,4) S(5,6) S(7,8) break;
case 13: S(0,12) S(1,10) S(2,9) S(3,7) S(5,11) S(6,8) S(1,6) S(2,3) S(4,11) S(7,9)
S(8,10) S(0,4) S(1,2) S(3,6) S(7,8) S(9,10) S(11,12) S(4,6) S(5,9) S(8,11)
S(10,12) S(0,5) S(3,8) S(4,7) S(6,11) S(9,10) S(0,1) S(2,5) S(6,9) S(7,8)
S(10,11) S(1,3) S(2,4) S(5,6) S(9,10) S(1,2) S(3,4) S(5,7) S(6,8) S(2,3)
S(4,5) S(6,7) S(8,9) S(3,4) S(5,6) break;
case 14: S(0,1) S(2,3) S(4,5) S(6,7) S(8,9) S(10,11) S(12,13) S(0,2) S(1,3) S(4,8)
S(5,9) S(10,12) S(11,13) S(0,4) S(1,2) S(3,7) S(5,8) S(6,10) S(9,13) S(11,12)
S(0,6) S(1,5) S(3,9) S(4,10) S(7,13) S(8,12) S(2,10) S(3,11) S(4,6) S(7,9)
S(1,3) S(2,8) S(5,11) S(6,7) S(10,12) S(1,4) S(2,6) S(3,5) S(7,11) S(8,10)
S(9,12) S(2,4) S(3,6) S(5,8) S(7,10) S(9,11) S(3,4) S(5,6) S(7,8) S(9,10)
S(6,7) break;
case 15: S(1,2) S(3,10) S(4,14) S(5,8) S(6,13) S(7,12) S(9,11) S(0,14) S(1,5) S(2,8)
S(3,7) S(6,9) S(10,12) S(11,13) S(0,7) S(1,6) S(2,9) S(4,10) S(5,11) S(8,13)
S(12,14) S(0,6) S(2,4) S(3,5) S(7,11) S(8,10) S(9,12) S(13,14) S(0,3) S(1,2)
S(4,7) S(5,9) S(6,8) S(10,11) S(12,13) S(0,1) S(2,3) S(4,6) S(7,9) S(10,12)
S(11,13) S(1,2) S(3,5) S(8,10) S(11,12) S(3,4) S(5,6) S(7,8) S(9,10) S(2,3)
S(4,5) S(6,7) S(8,9) S(10,11) S(5,6) S(7,8) break;
case 16: S(0,13) S(1,12) S(2,15) S(3,14) S(4,8) S(5,6) S(7,11) S(9,10) S(0,5) S(1,7)
S(2,9) S(3,4) S(6,13) S(8,14) S(10,15) S(11,12) S(0,1) S(2,3) S(4,5) S(6,8)
S(7,9) S(10,11) S(12,13) S(14,15) S(0,2) S(1,3) S(4,10) S(5,11) S(6,7) S(8,9)
S(12,14) S(13,15) S(1,2) S(3,12) S(4,6) S(5,7) S(8,10) S(9,11) S(13,14) S(1,4)
S(2,6) S(5,8) S(7,10) S(9,13) S(11,14) S(2,4) S(3,6) S(9,12) S(11,13) S(3,5)
S(6,8) S(7,9) S(10,12) S(3,4) S(5,6) S(7,8) S(9,10) S(11,12) S(6,7) S(8,9) break;
default: __builtin_unreachable();
}
#undef S
}
template <typename T, int N> void sort(T* a) {
sort(a, N);
}
template void sort<int, 4>(int* a);
template void sort<float, 4>(float* a);
//template void sort<int>(int* a, int n);
//template void sort<float>(float* a, int n);
void sort<int, 4>(int*):
mov eax, dword ptr [rdi + 8]
mov ecx, dword ptr [rdi]
mov r9d, dword ptr [rdi + 12]
mov edx, dword ptr [rdi + 4]
cmp ecx, eax
mov esi, eax
mov r8d, r9d
cmovg esi, ecx
cmovl eax, ecx
cmp edx, r9d
cmovl r9d, edx
cmovg r8d, edx
cmp eax, r9d
mov edx, r9d
cmovl r9d, eax
cmovg edx, eax
cmp esi, r8d
mov eax, r8d
cmovl r8d, esi
cmovg eax, esi
mov dword ptr [rdi], r9d
cmp edx, r8d
mov dword ptr [rdi + 12], eax
mov eax, r8d
cmovl r8d, edx
cmovg eax, edx
mov dword ptr [rdi + 4], r8d
mov dword ptr [rdi + 8], eax
ret
void sort<float, 4>(float*):
vmovss xmm0, dword ptr [rdi + 8]
vmovss xmm1, dword ptr [rdi]
vmovss xmm2, dword ptr [rdi + 4]
vmaxss xmm3, xmm1, xmm0
vminss xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rdi + 12]
vmaxss xmm4, xmm2, xmm1
vminss xmm1, xmm1, xmm2
vmaxss xmm2, xmm0, xmm1
vminss xmm0, xmm1, xmm0
vminss xmm1, xmm4, xmm3
vmaxss xmm5, xmm3, xmm4
vmaxss xmm3, xmm2, xmm1
vminss xmm1, xmm1, xmm2
vmovss dword ptr [rdi], xmm0
vmovss dword ptr [rdi + 12], xmm5
vmovss dword ptr [rdi + 4], xmm1
vmovss dword ptr [rdi + 8], xmm3
ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment