Last active
October 6, 2020 12:51
-
-
Save amonakov/477f7370aa1fe51e77409f6f4a9f9dbe to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// SPDX-License-Identifier: MIT | |
// Copyright 2018 Alexander Monakov <[email protected]> | |
/* This implements a sort function suitable for GCC use cases: | |
- signature-compatible to C qsort, but with relaxed contract: | |
- may apply the comparator to elements moved to a temporary buffer | |
- may abort on allocation failure | |
- deterministic (not stable, but allows a stable variant at low cost) | |
- fast, especially for common cases (0-5 elements of size 8 or 4) | |
The implementation uses a sorting network for up to 5 elements and | |
a merge sort on top of that. Neither stage has branches depending on | |
comparator result, trading extra arithmetic for branch mispredictions. */ | |
#include <stdlib.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
#include <string.h> | |
#ifdef __GNUC__ | |
#define likely(cond) __builtin_expect((cond), 1) | |
#define noinline __attribute__((__noinline__)) | |
#else | |
#define likely(cond) (cond) | |
#define noinline | |
#endif | |
typedef int cmp_fn(const void *, const void *); | |
/* Structure holding read-mostly (read-only in netsort) context. */ | |
struct sort_ctx { | |
cmp_fn *cmp; // pointer to comparator | |
char *out; // output buffer | |
size_t n; // number of elements | |
size_t size; // element size | |
}; | |
/* Helper for netsort. Permute, possibly in-place, 2 or 3 elements, | |
placing E0 to C->OUT, E1 to C->OUT + C->SIZE, and so on. */ | |
static void reorder23(struct sort_ctx *c, char *e0, char *e1, char *e2) | |
{ | |
#define REORDER_23(TYPE, STRIDE, OFFSET) do { \ | |
TYPE t0, t1; \ | |
memcpy(&t0, e0 + OFFSET, sizeof t0); \ | |
memcpy(&t1, e1 + OFFSET, sizeof t1); \ | |
char *out = c->out + OFFSET; \ | |
if (likely(c->n == 3)) \ | |
memmove(out + 2*STRIDE, e2 + OFFSET, sizeof t0); \ | |
memcpy(out, &t0, sizeof t0); out += STRIDE; \ | |
memcpy(out, &t1, sizeof t1); \ | |
} while (0) | |
if (likely(c->size == sizeof(size_t))) | |
REORDER_23(size_t, sizeof(size_t), 0); | |
else if (likely(c->size == sizeof(int))) | |
REORDER_23(int, sizeof(int), 0); | |
else { | |
size_t offset = 0, step = sizeof(size_t); | |
for (; offset + step <= c->size; offset += step) | |
REORDER_23(size_t, c->size, offset); | |
for (; offset < c->size; offset++) | |
REORDER_23(char, c->size, offset); | |
} | |
} | |
/* Like reorder23, but permute 4 or 5 elements. */ | |
static void | |
reorder45(struct sort_ctx *c, char *e0, char *e1, char *e2, char *e3, char *e4) | |
{ | |
#define REORDER_45(TYPE, STRIDE, OFFSET) do { \ | |
TYPE t0, t1, t2, t3; \ | |
memcpy(&t0, e0 + OFFSET, sizeof t0); \ | |
memcpy(&t1, e1 + OFFSET, sizeof t1); \ | |
memcpy(&t2, e2 + OFFSET, sizeof t2); \ | |
memcpy(&t3, e3 + OFFSET, sizeof t3); \ | |
char *out = c->out + OFFSET; \ | |
if (likely(c->n == 5)) \ | |
memmove(out + 4*STRIDE, e4 + OFFSET, sizeof t0); \ | |
memcpy(out, &t0, sizeof t0); out += STRIDE; \ | |
memcpy(out, &t1, sizeof t1); out += STRIDE; \ | |
memcpy(out, &t2, sizeof t2); out += STRIDE; \ | |
memcpy(out, &t3, sizeof t3); \ | |
} while (0) | |
if (likely(c->size == sizeof(size_t))) | |
REORDER_45(size_t, sizeof(size_t), 0); | |
else if (likely(c->size == sizeof(int))) | |
REORDER_45(int, sizeof(int), 0); | |
else { | |
size_t offset = 0, step = sizeof(size_t); | |
for (; offset + step <= c->size; offset += step) | |
REORDER_45(size_t, c->size, offset); | |
for (; offset < c->size; offset++) | |
REORDER_45(char, c->size, offset); | |
} | |
} | |
/* Helper for netsort. Invoke comparator CMP on E0 and E1. | |
Return E0^E1 if E0 compares less than E1, zero otherwise. | |
This is noinline to reduce code growth and confine invocation | |
to a single call site, assisting indirect branch prediction. */ | |
noinline static intptr_t cmp1(char *e0, char *e1, cmp_fn *cmp) | |
{ | |
intptr_t x = (intptr_t) e0 ^ (intptr_t) e1; | |
return x & (cmp(e0, e1) >> 31); | |
} | |
/* Apply a sorting network to 2 to 5 elements from IN, placing them into C->OUT. | |
IN may be equal to C->OUT, in which case elements are sorted in place. | |
For 2 or 3 elements this is a stable sort. */ | |
static void netsort(char *in, struct sort_ctx *c) | |
{ | |
#define CMP(e0, e1) do { \ | |
intptr_t x = cmp1(e1, e0, c->cmp); \ | |
e0 = (char *)((intptr_t)e0 ^ x); \ | |
e1 = (char *)((intptr_t)e1 ^ x); \ | |
} while (0) | |
char *e0 = in, *e1 = e0 + c->size, *e2 = e1 + c->size; | |
CMP(e0, e1); | |
if (likely(c->n == 3)) { | |
CMP(e1, e2); | |
CMP(e0, e1); | |
} | |
if (c->n <= 3) { | |
reorder23(c, e0, e1, e2); | |
return; | |
} | |
char *e3 = e2 + c->size, *e4 = e3 + c->size; | |
if (likely(c->n == 5)) { | |
CMP(e3, e4); | |
CMP(e2, e4); | |
} | |
CMP(e2, e3); | |
if (likely(c->n == 5)) { | |
CMP(e0, e3); | |
CMP(e1, e4); | |
} | |
CMP(e0, e2); | |
CMP(e1, e3); | |
CMP(e1, e2); | |
reorder45(c, e0, e1, e2, e3, e4); | |
} | |
/* Execute merge sort on N elements from IN, placing them into OUT, | |
using TMP as temporary storage if IN is equal to OUT. | |
This is a stable sort if netsort is used only for 2 or 3 elements. */ | |
static void mergesort(char *in, struct sort_ctx *c, size_t n, char *out, char *tmp) | |
{ | |
if (likely(n <= 5)) { | |
c->out = out; | |
c->n = n; | |
netsort(in, c); | |
return; | |
} | |
size_t nl = n / 2, nr = n - nl, sz = nl * c->size; | |
char *mid = in + sz, *r = out + sz, *l = in == out ? tmp : in; | |
/* Sort the right half, outputting to right half of OUT. */ | |
mergesort(mid, c, nr, r, tmp); | |
/* Sort the left half, leaving left half of OUT free. */ | |
mergesort(in, c, nl, l, mid); | |
/* Merge sorted halves given by L, R to [OUT, END). */ | |
#define MERGE_ELTSIZE(SIZE) do { \ | |
intptr_t mr = c->cmp(r, l) >> 31; \ | |
intptr_t lr = (intptr_t)l ^ (intptr_t)r; \ | |
lr = (intptr_t)l ^ (lr & mr); \ | |
out = memcpy(out, (char *)lr, SIZE); \ | |
out += SIZE; \ | |
r += mr & SIZE; \ | |
if (r == out) return; \ | |
l += ~mr & SIZE; \ | |
} while (r != end) | |
if (likely(c->cmp(r, l + (r - out) - c->size) < 0)) { | |
char *end = out + n * c->size; | |
if (sizeof(size_t) == 8 && likely(c->size == 8)) | |
MERGE_ELTSIZE(8); | |
else if (likely(c->size == 4)) | |
MERGE_ELTSIZE(4); | |
else | |
MERGE_ELTSIZE(c->size); | |
} | |
memcpy(out, l, r - out); | |
} | |
void metsort(void *base, size_t n, size_t size, cmp_fn *cmp) | |
{ | |
if (n < 2) | |
return; | |
struct sort_ctx c = {cmp, base, n, size}; | |
_Alignas(max_align_t) char scratch[256]; | |
size_t bufsz = (n / 2) * size; | |
void *buf = bufsz <= sizeof scratch ? scratch : malloc(bufsz); | |
if (!buf) abort(); | |
mergesort(base, &c, n, base, buf); | |
if (buf != scratch) | |
free(buf); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment