Last active
March 6, 2025 08:58
-
-
Save CAFxX/9aa833cc4ee9e27107f3272158e236ab to your computer and use it in GitHub Desktop.
Sorting network for short arrays, branchless (CMOVxx, VMINSS/VMAXSS) - https://godbolt.org/z/v4G4xPofc
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
template <typename T> | |
static void sort(T* a, int l) { | |
#define S(i, j) { \ | |
T t1 = a[i], t2 = a[j]; \ | |
if (t1 > t2) { T t = t1; t1 = t2; t2 = t; } \ | |
a[i] = t1, a[j] = t2;\ | |
} | |
// Sorting networks from https://bertdobbelaere.github.io/sorting_networks.html | |
// Using the ones with lower CEs because every S(...) requires two CMOVxx or a | |
// VMINSS+VMAXSS pair, and it seems that's the limit per cycle on current | |
// processors (e.g. Zen5). | |
switch (l) { | |
case 0: break; | |
case 1: break; | |
case 2: S(0,1) break; | |
case 3: S(0,2) S(0,1) S(1,2) break; | |
case 4: S(0,2) S(1,3) S(0,1) S(2,3) S(1,2) break; | |
case 5: S(0,3) S(1,4) S(0,2) S(1,3) S(0,1) S(2,4) S(1,2) S(3,4) S(2,3) break; | |
case 6: S(0,5) S(1,3) S(2,4) S(1,2) S(3,4) S(0,3) S(2,5) S(0,1) S(2,3) S(4,5) | |
S(1,2) S(3,4) break; | |
case 7: S(0,6) S(2,3) S(4,5) S(0,2) S(1,4) S(3,6) S(0,1) S(2,5) S(3,4) S(1,2) | |
S(4,6) S(2,3) S(4,5) S(1,2) S(3,4) S(5,6) break; | |
case 8: S(0,2) S(1,3) S(4,6) S(5,7) S(0,4) S(1,5) S(2,6) S(3,7) S(0,1) S(2,3) | |
S(4,5) S(6,7) S(2,4) S(3,5) S(1,4) S(3,6) S(1,2) S(3,4) S(5,6) break; | |
case 9: S(0,3) S(1,7) S(2,5) S(4,8) S(0,7) S(2,4) S(3,8) S(5,6) S(0,2) S(1,3) | |
S(4,5) S(7,8) S(1,4) S(3,6) S(5,7) S(0,1) S(2,4) S(3,5) S(6,8) S(2,3) | |
S(4,5) S(6,7) S(1,2) S(3,4) S(5,6) break; | |
case 10: S(0,8) S(1,9) S(2,7) S(3,5) S(4,6) S(0,2) S(1,4) S(5,8) S(7,9) S(0,3) | |
S(0,1) S(2,9) S(3,7) S(4,5) S(8,9) S(2,4) S(5,7) S(6,9) S(0,1) S(3,6) | |
S(8,9) S(1,5) S(2,3) S(4,8) S(6,7) S(1,2) S(3,5) S(4,6) S(7,8) S(2,3) | |
S(4,5) S(6,7) S(3,4) S(5,6) break; | |
case 11: S(0,9) S(1,6) S(2,4) S(3,7) S(5,8) S(0,1) S(3,5) S(4,10) S(6,9) S(7,8) | |
S(1,3) S(2,5) S(4,7) S(8,10) S(0,4) S(1,2) S(3,7) S(5,9) S(6,8) S(0,1) | |
S(2,6) S(4,5) S(7,8) S(9,10) S(2,4) S(3,6) S(5,7) S(8,9) S(1,2) S(3,4) | |
S(5,6) S(7,8) S(2,3) S(4,5) S(6,7) break; | |
case 12: S(0,8) S(1,7) S(2,6) S(3,11) S(4,10) S(5,9) S(0,1) S(2,5) S(3,4) S(6,9) | |
S(7,8) S(10,11) S(0,2) S(1,6) S(5,10) S(9,11) S(0,3) S(1,2) S(4,6) S(5,7) | |
S(8,11) S(9,10) S(1,4) S(3,5) S(6,8) S(7,10) S(1,3) S(2,5) S(6,9) S(8,10) | |
S(2,3) S(4,5) S(6,7) S(8,9) S(4,6) S(5,7) S(3,4) S(5,6) S(7,8) break; | |
case 13: S(0,12) S(1,10) S(2,9) S(3,7) S(5,11) S(6,8) S(1,6) S(2,3) S(4,11) S(7,9) | |
S(8,10) S(0,4) S(1,2) S(3,6) S(7,8) S(9,10) S(11,12) S(4,6) S(5,9) S(8,11) | |
S(10,12) S(0,5) S(3,8) S(4,7) S(6,11) S(9,10) S(0,1) S(2,5) S(6,9) S(7,8) | |
S(10,11) S(1,3) S(2,4) S(5,6) S(9,10) S(1,2) S(3,4) S(5,7) S(6,8) S(2,3) | |
S(4,5) S(6,7) S(8,9) S(3,4) S(5,6) break; | |
case 14: S(0,1) S(2,3) S(4,5) S(6,7) S(8,9) S(10,11) S(12,13) S(0,2) S(1,3) S(4,8) | |
S(5,9) S(10,12) S(11,13) S(0,4) S(1,2) S(3,7) S(5,8) S(6,10) S(9,13) S(11,12) | |
S(0,6) S(1,5) S(3,9) S(4,10) S(7,13) S(8,12) S(2,10) S(3,11) S(4,6) S(7,9) | |
S(1,3) S(2,8) S(5,11) S(6,7) S(10,12) S(1,4) S(2,6) S(3,5) S(7,11) S(8,10) | |
S(9,12) S(2,4) S(3,6) S(5,8) S(7,10) S(9,11) S(3,4) S(5,6) S(7,8) S(9,10) | |
S(6,7) break; | |
case 15: S(1,2) S(3,10) S(4,14) S(5,8) S(6,13) S(7,12) S(9,11) S(0,14) S(1,5) S(2,8) | |
S(3,7) S(6,9) S(10,12) S(11,13) S(0,7) S(1,6) S(2,9) S(4,10) S(5,11) S(8,13) | |
S(12,14) S(0,6) S(2,4) S(3,5) S(7,11) S(8,10) S(9,12) S(13,14) S(0,3) S(1,2) | |
S(4,7) S(5,9) S(6,8) S(10,11) S(12,13) S(0,1) S(2,3) S(4,6) S(7,9) S(10,12) | |
S(11,13) S(1,2) S(3,5) S(8,10) S(11,12) S(3,4) S(5,6) S(7,8) S(9,10) S(2,3) | |
S(4,5) S(6,7) S(8,9) S(10,11) S(5,6) S(7,8) break; | |
case 16: S(0,13) S(1,12) S(2,15) S(3,14) S(4,8) S(5,6) S(7,11) S(9,10) S(0,5) S(1,7) | |
S(2,9) S(3,4) S(6,13) S(8,14) S(10,15) S(11,12) S(0,1) S(2,3) S(4,5) S(6,8) | |
S(7,9) S(10,11) S(12,13) S(14,15) S(0,2) S(1,3) S(4,10) S(5,11) S(6,7) S(8,9) | |
S(12,14) S(13,15) S(1,2) S(3,12) S(4,6) S(5,7) S(8,10) S(9,11) S(13,14) S(1,4) | |
S(2,6) S(5,8) S(7,10) S(9,13) S(11,14) S(2,4) S(3,6) S(9,12) S(11,13) S(3,5) | |
S(6,8) S(7,9) S(10,12) S(3,4) S(5,6) S(7,8) S(9,10) S(11,12) S(6,7) S(8,9) break; | |
default: __builtin_unreachable(); | |
} | |
#undef S | |
} | |
template <typename T, int N> void sort(T* a) { | |
sort(a, N); | |
} | |
template void sort<int, 4>(int* a); | |
template void sort<float, 4>(float* a); | |
//template void sort<int>(int* a, int n); | |
//template void sort<float>(float* a, int n); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void sort<int, 4>(int*): | |
mov eax, dword ptr [rdi + 8] | |
mov ecx, dword ptr [rdi] | |
mov r9d, dword ptr [rdi + 12] | |
mov edx, dword ptr [rdi + 4] | |
cmp ecx, eax | |
mov esi, eax | |
mov r8d, r9d | |
cmovg esi, ecx | |
cmovl eax, ecx | |
cmp edx, r9d | |
cmovl r9d, edx | |
cmovg r8d, edx | |
cmp eax, r9d | |
mov edx, r9d | |
cmovl r9d, eax | |
cmovg edx, eax | |
cmp esi, r8d | |
mov eax, r8d | |
cmovl r8d, esi | |
cmovg eax, esi | |
mov dword ptr [rdi], r9d | |
cmp edx, r8d | |
mov dword ptr [rdi + 12], eax | |
mov eax, r8d | |
cmovl r8d, edx | |
cmovg eax, edx | |
mov dword ptr [rdi + 4], r8d | |
mov dword ptr [rdi + 8], eax | |
ret | |
void sort<float, 4>(float*): | |
vmovss xmm0, dword ptr [rdi + 8] | |
vmovss xmm1, dword ptr [rdi] | |
vmovss xmm2, dword ptr [rdi + 4] | |
vmaxss xmm3, xmm1, xmm0 | |
vminss xmm0, xmm0, xmm1 | |
vmovss xmm1, dword ptr [rdi + 12] | |
vmaxss xmm4, xmm2, xmm1 | |
vminss xmm1, xmm1, xmm2 | |
vmaxss xmm2, xmm0, xmm1 | |
vminss xmm0, xmm1, xmm0 | |
vminss xmm1, xmm4, xmm3 | |
vmaxss xmm5, xmm3, xmm4 | |
vmaxss xmm3, xmm2, xmm1 | |
vminss xmm1, xmm1, xmm2 | |
vmovss dword ptr [rdi], xmm0 | |
vmovss dword ptr [rdi + 12], xmm5 | |
vmovss dword ptr [rdi + 4], xmm1 | |
vmovss dword ptr [rdi + 8], xmm3 | |
ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment