igorburago · July 3, 2025 16:14
diff --git a/partsort.c b/partsort.c
 // Copyright 2024 Igor Burago
 // SPDX-License-Identifier: ISC

 #include <math.h> // for rank_partition()
 #include <stddef.h> // ptrdiff_t
 #include <stdint.h>
 #include <string.h> // memcpy(), memmove(), memset()

 // Built-in u128 is not strictly required, but it makes mcg64 simpler and faster.
 typedef __uint128_t uint128_t;

 #define must_inline     inline __attribute__((always_inline))
 #define must_not_inline __attribute__((noinline))
 #define inline_within   __attribute__((flatten))

 #if __has_builtin(__builtin_unpredictable)
  #define fickle(x) (__builtin_unpredictable((x)))
 #else
  #define fickle(x) (x)
 #endif
 #if __has_builtin(__builtin_expect)
  #define usually(x, c) (__builtin_expect((x), (c)))
 #else
  #define usually(x, c) (x)
 #endif
 #define likely(x)   (usually(!!(x), 1))
 #define unlikely(x) (usually(!!(x), 0))

 #define glue_raw(x, y) x##y
 #define glue(x, y) glue_raw(x, y)
 #define dub(name) glue(name, glue(__dub_L, __LINE__))

 #define apriori(x) extern void *(*compile_time_check_dummy(void))[ \
  sizeof(struct { unsigned compile_time_check : (x) ? 1 : -1; })]

 // #define DEBUG_BUILD
 #if defined(DEBUG_BUILD)
  #include <stdio.h>
  #define surely(x) (likely(x) ? (void)0 : \
    (fprintf(stderr, "FATAL: %s:%d: surely(%s) does not hold!\n", __FILE__, __LINE__, #x), \
      fflush(NULL), \
      __builtin_trap()))
  #define unreachable() surely(0 && "This code path must never be reached.")
 #else
  #if __has_builtin(__builtin_unreachable)
    #define unreachable() (__builtin_unreachable())
  #else
    #define unreachable() ((void)(*(volatile int *)0 = 0))
  #endif
  static must_inline void surely_failure_deadend(void) { for (;;) { unreachable(); } }
  #define surely(x) (likely(x) ? (void)0 : surely_failure_deadend())
 #endif

 #define swap(x, y) do { \
  void *dub(xp) = &(x), *dub(yp) = &(y); \
  apriori(sizeof(x) == sizeof(y)); \
  unsigned char dub(tmp)[sizeof(x)]; \
  memcpy(dub(tmp), dub(xp), sizeof(dub(tmp))); \
  memmove(dub(xp), dub(yp), sizeof(dub(tmp))); \
  memcpy(dub(yp), dub(tmp), sizeof(dub(tmp))); \
 } while (0)

 static inline uint64_t
 splitmix64_next(uint64_t *s) {
  uint64_t x = *s += 0x9E3779B97F4A7C15u;
  x = (x^(x>>30)) * 0xBF58476D1CE4E5B9u;
  x = (x^(x>>27)) * 0x94D049BB133111EBu;
  return x^(x>>31);
 }

 struct pcg32 { uint64_t s, t; };
 static inline struct pcg32
 pcg32(uint64_t seed) {
  uint64_t a = splitmix64_next(&seed);
  uint64_t b = splitmix64_next(&seed);
  return (struct pcg32){.s = a, .t = b|1};
 }
 static inline uint32_t
 pcg32_next(struct pcg32 *r) {
  uint32_t k = r->s >> 59;
  uint32_t x = (r->s ^ (r->s>>18)) >> 27;
  r->s = r->s*0x5851F42D4C957F2Du + r->t;
  return (x >> k) | (x << (-k & 31));
 }

 struct mcg64 { uint128_t s; };
 static inline struct mcg64
 mcg64(uint64_t seed) {
  uint64_t a = splitmix64_next(&seed);
  uint64_t b = splitmix64_next(&seed);
  return (struct mcg64){((uint128_t)a << 64) + b};
 }
 static must_inline uint64_t
 mcg64_next(struct mcg64 *r) {
  return (r->s *= 0xDA942042E4DD58B5u) >> 64;
 }

 // Fast and only slightly biased for large (end-beg) approaching u32/u64 max.
 static must_inline uint32_t
 onto_range_u32(uint32_t x, uint32_t beg, uint32_t end) {
  x = ((uint64_t)x * (end - beg)) >> 32;
  return beg + x;
 }
 static must_inline uint64_t
 onto_range_u64(uint64_t x, uint64_t beg, uint64_t end) {
  x = ((uint128_t)x * (end - beg)) >> 64;
  return beg + x;
 }

 static must_inline uint64_t
 load_u64(void *p, size_t size) {
  union { unsigned char b[sizeof(uint64_t)]; uint64_t u; } x = {.u = 0};
  memcpy(x.b, p, size <= sizeof(uint64_t) ? size : sizeof(uint64_t));
  return x.u;
 }


 //-----------------------------------------------------------------------------*
 // ARRAY TYPE PARAMETERS
 //-----------------------------------------------------------------------------*

 typedef int item_t;
 #define item_less(x, y) ((x) < (y))

 // Both signed and unsigned types are supported:
 typedef uint32_t index_t;

 #if 1
  #define INDEX_RNG_USING_PCG
 #endif
 #if defined(INDEX_RNG_USING_PCG)
  typedef struct pcg32 index_rng_t;
  #define index_rng(seed) (pcg32(seed))
  #define index_rand(rng, len) (onto_range_u32(pcg32_next(rng), 0, (len)))
 #else
  // MCG is faster than LCG or PCG, and is usually good enough for quicksort:
  typedef struct mcg64 index_rng_t;
  #define index_rng(seed) (mcg64(seed))
  #define index_rand(rng, len) (onto_range_u64(mcg64_next(rng), 0, (len)))
 #endif


 //-----------------------------------------------------------------------------*
 // OPTION 1: One-off partial quicksort (no state shared between calls)
 //-----------------------------------------------------------------------------*

 #define max(x, y) ((x) > (y) ? (x) : (y))
 enum {
  QUICKSORT_CUTOFF_HARD = 4,
  QUICKSORT_CUTOFF_SOFT = max(8, 64/sizeof(item_t)),
 };
 apriori(QUICKSORT_CUTOFF_HARD >= 3); // We use the median-of-three pivoting.

 // Insertion sort.
 static inline void
 quicksort_cutoff_sort(item_t *arr, index_t len) {
  surely(len >= 0);
  for (index_t i = (len != 0); i != len; i++) {
    if fickle(!item_less(arr[i], arr[i-1])) { continue; }
    item_t x = arr[i];
    index_t j = i;
    do { arr[j] = arr[j-1]; } while (--j != 0 && fickle(item_less(x, arr[j-1])));
    arr[j] = x;
  }
 }
 // Hoare-partition elements in [0,len[ using the item at pivot_idx for pivot.
 static must_inline index_t
 partition(item_t *arr, index_t len, index_t pivot_idx) {
  surely(len >= 2), surely(0 <= pivot_idx), surely(pivot_idx < len);

  item_t piv = arr[pivot_idx];
  arr[pivot_idx] = arr[0];
  pivot_idx = item_less(arr[len-1], piv) ? len-1 : 0;
  arr[0] = arr[pivot_idx];
  arr[pivot_idx] = piv;

  index_t i = 0, j = len-1;
  for (;;) {
    do { i++; } while (item_less(arr[i], piv));
    do { j--; } while (item_less(piv, arr[j]));
    if (i >= j) { break; }
    swap(arr[i], arr[j]);
  }
  j += (pivot_idx != 0);
  swap(arr[j], arr[pivot_idx]);
  return j;
 }
 static must_inline index_t
 median_rand_3(index_rng_t *rng, item_t *arr, index_t len) {
  surely(len >= 3);

  index_t i[3];
  i[0] = index_rand(rng, len-0);
  i[1] = index_rand(rng, len-1);
  i[2] = index_rand(rng, len-2);
  unsigned i1_past_i0 = (i[1] >= i[0]);
  i[1] += i1_past_i0; // Count i[1]-th item, jumping over i[0]-th.
  i[2] += (i[2] >= i[!i1_past_i0]); // Count i[2]-th, skipping min(i[0],i[1])-th...
  i[2] += (i[2] >= i[i1_past_i0]); // ...and then max(i[0],i[1])-th.

  unsigned gt01 = !!item_less(arr[i[1]], arr[i[0]]);
  unsigned lt02 = !!item_less(arr[i[0]], arr[i[2]]);
  unsigned lt12 = !!item_less(arr[i[1]], arr[i[2]]);
  return i[(gt01^lt02) + (lt02^lt12)];
 }
 // Partially quick-sort elements from [l,r[ so that those whose ranks in
 // the sorted sequence belong to [lo,hi[ take their correct positions.
 static void
 partial_quicksort_rand_rec(index_rng_t *rng, item_t *arr,
    index_t lo, index_t hi, index_t l, index_t r) {
  for (;;) {
    if unlikely(r-l <= QUICKSORT_CUTOFF_HARD ||
        (r-l <= QUICKSORT_CUTOFF_SOFT && lo <= l && r <= hi)) {
      quicksort_cutoff_sort(arr+l, r-l);
      return;
    }
    index_t pivot_idx = median_rand_3(rng, arr+l, r-l);
    index_t r_left = l + partition(arr+l, r-l, pivot_idx);
    index_t l_right = r_left + 1;
    unsigned go_left  = (lo < r_left  && r_left-l  > 1);
    unsigned go_right = (l_right < hi && r-l_right > 1);

    switch fickle((go_left<<1) | go_right) {
      case 0: { return; }
      case 1: right: { l = l_right; break; }
      case 2: left:  { r = r_left;  break; }
      case 3: {
        if (r_left-l <= r-l_right) {
          partial_quicksort_rand_rec(rng, arr, lo, hi, l, r_left);
          goto right;
        } else {
          partial_quicksort_rand_rec(rng, arr, lo, hi, l_right, r);
          goto left;
        }
      }
      default: { unreachable(); }
    }
  }
 }
 static inline void
 partial_quicksort_rand(item_t *arr, index_t len, index_t lo, index_t hi) {
  surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len);
  if (len <= 1 || lo >= hi) { return; }

  index_rng_t rng = index_rng((uintptr_t)arr);
  index_t i = index_rand(&rng, len);
  rng = index_rng((uintptr_t)arr + load_u64(arr+i, (len-i)*sizeof(arr[0])));

  partial_quicksort_rand_rec(&rng, arr, lo, hi, 0, len);
 }


 //-----------------------------------------------------------------------------*
 // OPTION 2: Incremental quicksort (pivot indices are shared between calls)
 //-----------------------------------------------------------------------------*

 // Partially quick-sort elements from [l,r[ so that those whose ranks in the
 // sorted sequence belong to [lo,hi[ end up in their correct positions, taking
 // into account the history of partitions from previous invocations.
 //
 // The pivot indices of past partitions are stored in the part_idx array in
 // the order of preorder traversal of the quicksort tree. Special index values
 // SORTED and CHAOS signify that an interval is, respectively, fully sorted and
 // has not been partitioned yet (all part_idx must initially be set to CHAOS).
 #define INC_QUICKSORT_CHAOS ((index_t)-1)
 #define INC_QUICKSORT_SORTED ((index_t)-2)
 static void
 inc_quicksort_rand_rec(index_rng_t *rng, item_t *arr, index_t lo, index_t hi,
    index_t l, index_t r, index_t *part_idx) {
  for (;;) {
    int will_be_sorted = (lo <= l && r <= hi);
    if unlikely(r-l <= QUICKSORT_CUTOFF_HARD ||
        (r-l <= QUICKSORT_CUTOFF_SOFT && will_be_sorted)) {
      quicksort_cutoff_sort(arr+l, r-l);
      *part_idx = INC_QUICKSORT_SORTED;
      return;
    }
    index_t r_left = *part_idx;
    if likely(r_left == INC_QUICKSORT_CHAOS) {
      index_t pivot_idx = median_rand_3(rng, arr+l, r-l);
      r_left = l + partition(arr+l, r-l, pivot_idx);
    }
    *part_idx = will_be_sorted ? INC_QUICKSORT_SORTED : r_left;
    index_t l_right = r_left + 1;

    unsigned go_left  = (lo < r_left  && r_left-l  > 1);
    unsigned go_right = (l_right < hi && r-l_right > 1);
    index_t *part_idx_left  = part_idx + 1;
    index_t *part_idx_right = part_idx + (l_right - l);
    go_left  = (go_left  && *part_idx_left  != INC_QUICKSORT_SORTED);
    go_right = (go_right && *part_idx_right != INC_QUICKSORT_SORTED);

    switch fickle((go_left<<1) | go_right) {
      case 0: { return; }
      case 1: right: { l = l_right, part_idx = part_idx_right; break; }
      case 2: left:  { r = r_left,  part_idx = part_idx_left;  break; }
      case 3: {
        if (r_left-l <= r-l_right) {
          inc_quicksort_rand_rec(rng, arr, lo, hi, l, r_left, part_idx_left);
          goto right;
        } else {
          inc_quicksort_rand_rec(rng, arr, lo, hi, l_right, r, part_idx_right);
          goto left;
        }
      }
      default: { unreachable(); }
    }
  }
 }
 static inline void
 inc_quicksort_rand(item_t *arr, index_t *part_idxs, index_t len, index_t lo, index_t hi) {
  surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len);
  if (len <= 1 || lo >= hi || *part_idxs == INC_QUICKSORT_SORTED) { return; }

  index_rng_t rng = index_rng((uintptr_t)arr);
  index_t i = index_rand(&rng, len);
  rng = index_rng((uintptr_t)arr + load_u64(arr+i, (len-i)*sizeof(arr[0])));

  inc_quicksort_rand_rec(&rng, arr, lo, hi, 0, len, part_idxs);
 }


 //-----------------------------------------------------------------------------*
 // OPTION 3: Floyd-Rivest partitioning + N largest scan using binary heap
 //-----------------------------------------------------------------------------*

 // Binary heap implementation. Supports both min- and max-heaps, as well as
 // both left-to-right and right-to-left storage layout in memory.
 #define heap_superior(x, y) ((min_heap) ? item_less((x), (y)) : item_less((y), (x)))
 #define heap_at(i) (*((stored_backwards) ? (root)-(i) : (root)+(i)))
 static inline void
 heap_fix_down(int min_heap, int stored_backwards, item_t *root, index_t node, index_t len) {
  surely(len > 0), surely(0 <= node), surely(node < len);
  item_t x = heap_at(node);
  for (index_t child; child = (node<<1)|1, child < len; node = child) {
    if likely(child+1 != len) {
      child += !!heap_superior(heap_at(child+1), heap_at(child));
    }
    if fickle(!heap_superior(heap_at(child), x)) { break; }
    heap_at(node) = heap_at(child);
  }
  heap_at(node) = x;
 }
 #undef heap_superior
 #undef heap_at
 static inline_within void
 heap_build(int min_heap, int stored_backwards, item_t *root, index_t len) {
  surely(len >= 0);
  for (index_t i = len/2; i != 0; i--) {
    heap_fix_down(min_heap, stored_backwards, root, i-1, len);
  }
 }
 #define lr_max_heap_fix_down(root, node, len) (heap_fix_down(0,0, (root), (node), (len)))
 #define rl_min_heap_fix_down(root, node, len) (heap_fix_down(1,1, (root), (node), (len)))
 #define lr_max_heap_build(root, len) (heap_build(0,0, (root), (len)))
 #define rl_min_heap_build(root, len) (heap_build(1,1, (root), (len)))

 // Use Floyd-Rivest Select algorithm to partition elements in [beg,end[ using
 // the pivot_rank-th largest element in that interval (counting from zero) for
 // the pivot, which hence ends up at exactly the pivot_rank index in the array
 // when partitioning completes.
 //
 // - Floyd, R. W., Rivest, R. L. (1975). Expected time bounds for selection.
 //   Communications of the ACM, 18(3), 165-172.
 // - Kiwiel, K. C. (2005). On Floyd and Rivest's SELECT algorithm.
 //   Theoretical Computer Science, 347(1-2), 214-238.
 static void
 rank_partition(item_t *arr, index_t beg, index_t end, index_t pivot_rank) {
  enum { CUTOFF_LEN = 600 }; // At most 4 levels of recursion for 32-bit ranges.
  surely(beg <= end), surely(beg <= pivot_rank), surely(pivot_rank < end);
  ptrdiff_t l = beg, r = end-1, k = pivot_rank;
  while (l < r) {
    ptrdiff_t n = r - l + 1;
    ptrdiff_t m = k - l + 1;
    if (n > CUTOFF_LEN) {
      double log_n = log(n);
      ptrdiff_t s = (ptrdiff_t)exp(log_n*(2/3.0)) / 2;
      ptrdiff_t g = (1 - 2*(m < n/2)) * (ptrdiff_t)sqrt(log_n*(n-s)*s/n) / 2;
      ptrdiff_t l_sample = k + g - m*s/n;
      ptrdiff_t r_sample = k + g + (n-m)*s/n;
      if unlikely(l_sample < l) { l_sample = l; }
      if unlikely(r_sample > r) { r_sample = r; }
      rank_partition(arr, l_sample, r_sample+1, k);
    }
    ptrdiff_t p = l + partition(arr+l, n, m-1);
    if fickle(k <= p) { r = p-1; }
    if fickle(p <= k) { l = p+1; }
  }
 }
 // Partially heap-sort elements from [0,len[ so that those whose ranks in
 // the sorted sequence belong to [lo,hi[ take their correct positions.
 //
 // If the requested sorting interval is closer to the beginning of the array:
 //
 // (A1) Partition the array around the (hi-1)-st largest element as a pivot.
 // (A2) Build a right-to-left min-heap out of elements in [lo,hi[ so that
 //      the root node holding the minimum value is stored at index hi-1.
 // (A3) For each element in [0,lo[, if it is exceeding the current heap
 //      minimum, exchange the former with the latter, fixing the heap.
 // (A4) Fill [lo,hi[ going left to right with successively removed heap
 //      minimums, simultaneously shortening (and maintaining) the heap.
 //
 // Otherwise, if the interval is closer to the end of the array:
 //
 // (B1) Partition the array around the lo-th largest element as a pivot.
 // (B2) Build a left-to-right max-heap out of elements in [lo,hi[ so that
 //      the root node holding the maximum value is stored at index lo.
 // (B3) For each element in [hi,len[, if it is inferior to the current heap
 //      maximum, exchange the former with the latter, fixing the heap.
 // (B4) Fill [lo,hi[ going right to left with successively removed heap
 //      maximums, simultaneously shortening (and maintaining) the heap.
 static inline void
 rank_partitioned_heapsort(item_t *arr, index_t len, index_t lo, index_t hi) {
  surely(len >= 0), surely(0 <= lo), surely(lo < hi), surely(hi <= len);
  if (len <= 1 || lo >= hi) { return; }

  if (lo < len-hi) {
    rank_partition(arr, 0, len, hi-1);

    index_t heap_len = hi - lo;
    item_t *heap_root = arr + hi - 1;
    rl_min_heap_build(heap_root, heap_len);

    for (index_t i = 0; i != lo; i++) {
      if (!item_less(*heap_root, arr[i])) { continue; }
      swap(arr[i], *heap_root);
      rl_min_heap_fix_down(heap_root, 0, heap_len);
    }

    for (index_t i = lo; i != hi-1; i++) {
      swap(arr[i], *heap_root);
      rl_min_heap_fix_down(heap_root, 0, --heap_len);
    }
  } else {
    if likely(lo != 0) { rank_partition(arr, 0, len, lo); }

    index_t heap_len = hi - lo;
    item_t *heap_root = arr + lo;
    lr_max_heap_build(heap_root, heap_len);

    for (index_t i = hi; i != len; i++) {
      if (!item_less(arr[i], *heap_root)) { continue; }
      swap(arr[i], *heap_root);
      lr_max_heap_fix_down(heap_root, 0, heap_len);
    }

    for (index_t i = hi-1; i != lo; i--) {
      swap(arr[i], *heap_root);
      lr_max_heap_fix_down(heap_root, 0, --heap_len);
    }
  }
 }


 //-----------------------------------------------------------------------------*
 // BENCHMARKS
 //-----------------------------------------------------------------------------*

 #include <stdio.h> // printf()
 #include <stdlib.h> // calloc(), free()
 #include <time.h> // clock_gettime()

 static inline uint64_t
 nsec(void) {
  struct timespec t;
  clock_gettime(CLOCK_MONOTONIC_RAW, &t);
  return (uint64_t)t.tv_sec*1000*1000*1000 + (uint64_t)t.tv_nsec;
 }

 static inline void
 shuffle(struct pcg32 *r, item_t *arr, uint32_t n) {
  for (uint32_t i = 0; i < n; i++) {
    swap(arr[i], arr[onto_range_u32(pcg32_next(r), i, n)]);
  }
 }

 static inline void
 call_partial_quicksort_rand(item_t *arr, index_t *part_idxs, index_t len, index_t lo, index_t hi) {
  (void)part_idxs;
  partial_quicksort_rand(arr, len, lo, hi);
 }
 static inline void
 call_rank_partitioned_heapsort(item_t *arr, index_t *part_idxs, index_t len, index_t lo, index_t hi) {
  (void)part_idxs;
  rank_partitioned_heapsort(arr, len, lo, hi);
 }

 extern int
 main(void) {
  enum {
    RANDOMIZE = 0,
    VERBOSE = 0,
  };
  typedef void partial_sort_test_func(item_t *, index_t *, index_t, index_t, index_t);
  static struct test_func {
    partial_sort_test_func *func;
    char const *name;
  } const funcs[] = {
    #define X(p,f) {glue(p,f), #f}
    X(call_,rank_partitioned_heapsort),
    X(call_,partial_quicksort_rand),
    X(,inc_quicksort_rand),
    #undef X
  }; enum { N_FUNCS = sizeof(funcs)/sizeof(funcs[0]) };
  static struct test_params {
    int n_runs;
    int n_sorts;
    index_t arr_len;
    index_t min_sort_len;
    index_t max_sort_len;
  } const tests[] = {
    {100, 5, 100, 5, 20},
    {7000, 1, 100*1000, 100, 1000},
    {500, 100, 100*1000, 100, 1000},
    {30, 500, 1000*1000, 100, 5000},
    {8, 2000, 1000*1000, 100, 5000},
    {4, 500, 10*1000*1000, 100, 10*1000},
    {4, 25, 100*1000*1000, 100, 10*1000},
  }; enum { N_TESTS = sizeof(tests)/sizeof(tests[0]) };

  index_t max_arr_len = 0;
  for (int i = 0; i < N_TESTS; i++) {
    if (max_arr_len < tests[i].arr_len) {
      max_arr_len = tests[i].arr_len;
    }
  }
  item_t *arr = calloc(max_arr_len, sizeof(arr[0]));
  index_t *part_idxs = calloc(max_arr_len, sizeof(part_idxs[0]));
  surely(arr != NULL), surely(part_idxs != NULL);
  if (!arr || !part_idxs) { return 1; }

  uint64_t const rng_seed = RANDOMIZE ? nsec() : 809;

  for (int i_func = 0; i_func < N_FUNCS; i_func++) {
    partial_sort_test_func *const partial_sort = funcs[i_func].func;
    printf("\nFunction %s\n", funcs[i_func].name);

    struct pcg32 rng = pcg32(rng_seed);

    for (int i_test = 0; i_test < N_TESTS; i_test++) {
      struct test_params const T = tests[i_test];
      if (VERBOSE) {
        printf("Test %3d. Parameters: %d x %d, %td, %td..%td\n",
          i_test+1, T.n_runs, T.n_sorts, (ptrdiff_t)T.arr_len,
          (ptrdiff_t)T.min_sort_len, (ptrdiff_t)T.max_sort_len);
      }

      uint64_t test_dt = 0;
      for (int i_run = 0; i_run < T.n_runs; i_run++) {
        // Initialize the array to a random permutation (of its indices).
        for (index_t i = 0; i < T.arr_len; i++) { arr[i] = (item_t)i; }
        shuffle(&rng, arr, T.arr_len);
        // Set pivot indices to INC_QUICKSORT_CHAOS for inc_quicksort_rand():
        apriori(INC_QUICKSORT_CHAOS == (index_t)-1);
        memset(part_idxs, 0xFF, T.arr_len*sizeof(part_idxs[0]));

        uint64_t run_dt = 0;
        for (int i_sort = 0; i_sort < T.n_sorts; i_sort++) {
          index_t span = onto_range_u32(pcg32_next(&rng), T.min_sort_len, T.max_sort_len+1);
          index_t lo = onto_range_u32(pcg32_next(&rng), 0, T.arr_len-span+1);
          index_t hi = lo + span;

          uint64_t t0 = nsec();
          (*partial_sort)(arr, part_idxs, T.arr_len, lo, hi);
          uint64_t dt = nsec() - t0;
          run_dt += dt;

          int ok = 1; index_t first_diff = 0;
          for (index_t i = lo; i < hi; i++) {
            if (arr[i] != (item_t)i) {
              ok = 0, first_diff = i;
              break;
            }
          }
          if (VERBOSE || !ok) {
            printf("Test %3d, run %3d, sort %3d. [%12td %12td] (%6td): %4s %.9f\n",
              i_test+1, i_run+1, i_sort+1, (ptrdiff_t)lo, (ptrdiff_t)hi-1, (ptrdiff_t)span,
              ok ? "OK" : "FAIL", dt*1e-9/span);
          }
          if (!ok) {
            enum { BEFORE = 5, AFTER = 10 };
            index_t l = first_diff-lo >= BEFORE ? first_diff-BEFORE : lo;
            index_t r = hi-first_diff >= AFTER+1 ? first_diff+AFTER+1 : hi;
            printf("  [%td..%td]:", (ptrdiff_t)l, (ptrdiff_t)r-1);
            for (index_t i = l; i < r; i++) {
              printf(" %llu", (unsigned long long)arr[i]);
            }
            printf("\n");
          }
        }
        test_dt += run_dt;

        if (VERBOSE) {
          printf("Test %3d, run %3d. Elapsed: %9.6f\n",
            i_test+1, i_run+1, run_dt*1e-9);
        }
      }

      printf("Test %3d. Elapsed: %7.3f total, %9.6f per run\n",
        i_test+1, test_dt*1e-9, test_dt*1e-9/T.n_runs);
      fflush(stdout);
    }
  }

  free(arr), arr = NULL;
  free(part_idxs), part_idxs = NULL;

  return 0;
 }
diff --git a/partsort_output.md b/partsort_output.md
diff --git a/§LICENSE b/§LICENSE
 ISC License

 Copyright 2024 Igor Burago

 Permission to use, copy, modify, and/or distribute this software for any
 purpose with or without fee is hereby granted, provided that the above
 copyright notice and this permission notice appear in all copies.

 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
	// Copyright 2024 Igor Burago
	// SPDX-License-Identifier: ISC

	#include <math.h> // for rank_partition()
	#include <stddef.h> // ptrdiff_t
	#include <stdint.h>
	#include <string.h> // memcpy(), memmove(), memset()

	// Built-in u128 is not strictly required, but it makes mcg64 simpler and faster.
	typedef __uint128_t uint128_t;

	#define must_inline inline __attribute__((always_inline))
	#define must_not_inline __attribute__((noinline))
	#define inline_within __attribute__((flatten))

	#if __has_builtin(__builtin_unpredictable)
	#define fickle(x) (__builtin_unpredictable((x)))
	#else
	#define fickle(x) (x)
	#endif
	#if __has_builtin(__builtin_expect)
	#define usually(x, c) (__builtin_expect((x), (c)))
	#else
	#define usually(x, c) (x)
	#endif
	#define likely(x) (usually(!!(x), 1))
	#define unlikely(x) (usually(!!(x), 0))

	#define glue_raw(x, y) x##y
	#define glue(x, y) glue_raw(x, y)
	#define dub(name) glue(name, glue(__dub_L, __LINE__))

	#define apriori(x) extern void (compile_time_check_dummy(void))[ \
	sizeof(struct { unsigned compile_time_check : (x) ? 1 : -1; })]

	// #define DEBUG_BUILD
	#if defined(DEBUG_BUILD)
	#include <stdio.h>
	#define surely(x) (likely(x) ? (void)0 : \
	(fprintf(stderr, "FATAL: %s:%d: surely(%s) does not hold!\n", __FILE__, __LINE__, #x), \
	fflush(NULL), \
	__builtin_trap()))
	#define unreachable() surely(0 && "This code path must never be reached.")
	#else
	#if __has_builtin(__builtin_unreachable)
	#define unreachable() (__builtin_unreachable())
	#else
	#define unreachable() ((void)((volatile int )0 = 0))
	#endif
	static must_inline void surely_failure_deadend(void) { for (;;) { unreachable(); } }
	#define surely(x) (likely(x) ? (void)0 : surely_failure_deadend())
	#endif

	#define swap(x, y) do { \
	void dub(xp) = &(x), dub(yp) = &(y); \
	apriori(sizeof(x) == sizeof(y)); \
	unsigned char dub(tmp)[sizeof(x)]; \
	memcpy(dub(tmp), dub(xp), sizeof(dub(tmp))); \
	memmove(dub(xp), dub(yp), sizeof(dub(tmp))); \
	memcpy(dub(yp), dub(tmp), sizeof(dub(tmp))); \
	} while (0)

	static inline uint64_t
	splitmix64_next(uint64_t *s) {
	uint64_t x = *s += 0x9E3779B97F4A7C15u;
	x = (x^(x>>30)) * 0xBF58476D1CE4E5B9u;
	x = (x^(x>>27)) * 0x94D049BB133111EBu;
	return x^(x>>31);
	}

	struct pcg32 { uint64_t s, t; };
	static inline struct pcg32
	pcg32(uint64_t seed) {
	uint64_t a = splitmix64_next(&seed);
	uint64_t b = splitmix64_next(&seed);
	return (struct pcg32){.s = a, .t = b\|1};
	}
	static inline uint32_t
	pcg32_next(struct pcg32 *r) {
	uint32_t k = r->s >> 59;
	uint32_t x = (r->s ^ (r->s>>18)) >> 27;
	r->s = r->s*0x5851F42D4C957F2Du + r->t;
	return (x >> k) \| (x << (-k & 31));
	}

	struct mcg64 { uint128_t s; };
	static inline struct mcg64
	mcg64(uint64_t seed) {
	uint64_t a = splitmix64_next(&seed);
	uint64_t b = splitmix64_next(&seed);
	return (struct mcg64){((uint128_t)a << 64) + b};
	}
	static must_inline uint64_t
	mcg64_next(struct mcg64 *r) {
	return (r->s *= 0xDA942042E4DD58B5u) >> 64;
	}

	// Fast and only slightly biased for large (end-beg) approaching u32/u64 max.
	static must_inline uint32_t
	onto_range_u32(uint32_t x, uint32_t beg, uint32_t end) {
	x = ((uint64_t)x * (end - beg)) >> 32;
	return beg + x;
	}
	static must_inline uint64_t
	onto_range_u64(uint64_t x, uint64_t beg, uint64_t end) {
	x = ((uint128_t)x * (end - beg)) >> 64;
	return beg + x;
	}

	static must_inline uint64_t
	load_u64(void *p, size_t size) {
	union { unsigned char b[sizeof(uint64_t)]; uint64_t u; } x = {.u = 0};
	memcpy(x.b, p, size <= sizeof(uint64_t) ? size : sizeof(uint64_t));
	return x.u;
	}


	//-----------------------------------------------------------------------------*
	// ARRAY TYPE PARAMETERS
	//-----------------------------------------------------------------------------*

	typedef int item_t;
	#define item_less(x, y) ((x) < (y))

	// Both signed and unsigned types are supported:
	typedef uint32_t index_t;

	#if 1
	#define INDEX_RNG_USING_PCG
	#endif
	#if defined(INDEX_RNG_USING_PCG)
	typedef struct pcg32 index_rng_t;
	#define index_rng(seed) (pcg32(seed))
	#define index_rand(rng, len) (onto_range_u32(pcg32_next(rng), 0, (len)))
	#else
	// MCG is faster than LCG or PCG, and is usually good enough for quicksort:
	typedef struct mcg64 index_rng_t;
	#define index_rng(seed) (mcg64(seed))
	#define index_rand(rng, len) (onto_range_u64(mcg64_next(rng), 0, (len)))
	#endif


	//-----------------------------------------------------------------------------*
	// OPTION 1: One-off partial quicksort (no state shared between calls)
	//-----------------------------------------------------------------------------*

	#define max(x, y) ((x) > (y) ? (x) : (y))
	enum {
	QUICKSORT_CUTOFF_HARD = 4,
	QUICKSORT_CUTOFF_SOFT = max(8, 64/sizeof(item_t)),
	};
	apriori(QUICKSORT_CUTOFF_HARD >= 3); // We use the median-of-three pivoting.

	// Insertion sort.
	static inline void
	quicksort_cutoff_sort(item_t *arr, index_t len) {
	surely(len >= 0);
	for (index_t i = (len != 0); i != len; i++) {
	if fickle(!item_less(arr[i], arr[i-1])) { continue; }
	item_t x = arr[i];
	index_t j = i;
	do { arr[j] = arr[j-1]; } while (--j != 0 && fickle(item_less(x, arr[j-1])));
	arr[j] = x;
	}
	}
	// Hoare-partition elements in [0,len[ using the item at pivot_idx for pivot.
	static must_inline index_t
	partition(item_t *arr, index_t len, index_t pivot_idx) {
	surely(len >= 2), surely(0 <= pivot_idx), surely(pivot_idx < len);

	item_t piv = arr[pivot_idx];
	arr[pivot_idx] = arr[0];
	pivot_idx = item_less(arr[len-1], piv) ? len-1 : 0;
	arr[0] = arr[pivot_idx];
	arr[pivot_idx] = piv;

	index_t i = 0, j = len-1;
	for (;;) {
	do { i++; } while (item_less(arr[i], piv));
	do { j--; } while (item_less(piv, arr[j]));
	if (i >= j) { break; }
	swap(arr[i], arr[j]);
	}
	j += (pivot_idx != 0);
	swap(arr[j], arr[pivot_idx]);
	return j;
	}
	static must_inline index_t
	median_rand_3(index_rng_t rng, item_t arr, index_t len) {
	surely(len >= 3);

	index_t i[3];
	i[0] = index_rand(rng, len-0);
	i[1] = index_rand(rng, len-1);
	i[2] = index_rand(rng, len-2);
	unsigned i1_past_i0 = (i[1] >= i[0]);
	i[1] += i1_past_i0; // Count i[1]-th item, jumping over i[0]-th.
	i[2] += (i[2] >= i[!i1_past_i0]); // Count i[2]-th, skipping min(i[0],i[1])-th...
	i[2] += (i[2] >= i[i1_past_i0]); // ...and then max(i[0],i[1])-th.

	unsigned gt01 = !!item_less(arr[i[1]], arr[i[0]]);
	unsigned lt02 = !!item_less(arr[i[0]], arr[i[2]]);
	unsigned lt12 = !!item_less(arr[i[1]], arr[i[2]]);
	return i[(gt01^lt02) + (lt02^lt12)];
	}
	// Partially quick-sort elements from [l,r[ so that those whose ranks in
	// the sorted sequence belong to [lo,hi[ take their correct positions.
	static void
	partial_quicksort_rand_rec(index_rng_t rng, item_t arr,
	index_t lo, index_t hi, index_t l, index_t r) {
	for (;;) {
	if unlikely(r-l <= QUICKSORT_CUTOFF_HARD \|\|
	(r-l <= QUICKSORT_CUTOFF_SOFT && lo <= l && r <= hi)) {
	quicksort_cutoff_sort(arr+l, r-l);
	return;
	}
	index_t pivot_idx = median_rand_3(rng, arr+l, r-l);
	index_t r_left = l + partition(arr+l, r-l, pivot_idx);
	index_t l_right = r_left + 1;
	unsigned go_left = (lo < r_left && r_left-l > 1);
	unsigned go_right = (l_right < hi && r-l_right > 1);

	switch fickle((go_left<<1) \| go_right) {
	case 0: { return; }
	case 1: right: { l = l_right; break; }
	case 2: left: { r = r_left; break; }
	case 3: {
	if (r_left-l <= r-l_right) {
	partial_quicksort_rand_rec(rng, arr, lo, hi, l, r_left);
	goto right;
	} else {
	partial_quicksort_rand_rec(rng, arr, lo, hi, l_right, r);
	goto left;
	}
	}
	default: { unreachable(); }
	}
	}
	}
	static inline void
	partial_quicksort_rand(item_t *arr, index_t len, index_t lo, index_t hi) {
	surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len);
	if (len <= 1 \|\| lo >= hi) { return; }

	index_rng_t rng = index_rng((uintptr_t)arr);
	index_t i = index_rand(&rng, len);
	rng = index_rng((uintptr_t)arr + load_u64(arr+i, (len-i)*sizeof(arr[0])));

	partial_quicksort_rand_rec(&rng, arr, lo, hi, 0, len);
	}


	//-----------------------------------------------------------------------------*
	// OPTION 2: Incremental quicksort (pivot indices are shared between calls)
	//-----------------------------------------------------------------------------*

	// Partially quick-sort elements from [l,r[ so that those whose ranks in the
	// sorted sequence belong to [lo,hi[ end up in their correct positions, taking
	// into account the history of partitions from previous invocations.
	//
	// The pivot indices of past partitions are stored in the part_idx array in
	// the order of preorder traversal of the quicksort tree. Special index values
	// SORTED and CHAOS signify that an interval is, respectively, fully sorted and
	// has not been partitioned yet (all part_idx must initially be set to CHAOS).
	#define INC_QUICKSORT_CHAOS ((index_t)-1)
	#define INC_QUICKSORT_SORTED ((index_t)-2)
	static void
	inc_quicksort_rand_rec(index_rng_t rng, item_t arr, index_t lo, index_t hi,
	index_t l, index_t r, index_t *part_idx) {
	for (;;) {
	int will_be_sorted = (lo <= l && r <= hi);
	if unlikely(r-l <= QUICKSORT_CUTOFF_HARD \|\|
	(r-l <= QUICKSORT_CUTOFF_SOFT && will_be_sorted)) {
	quicksort_cutoff_sort(arr+l, r-l);
	*part_idx = INC_QUICKSORT_SORTED;
	return;
	}
	index_t r_left = *part_idx;
	if likely(r_left == INC_QUICKSORT_CHAOS) {
	index_t pivot_idx = median_rand_3(rng, arr+l, r-l);
	r_left = l + partition(arr+l, r-l, pivot_idx);
	}
	*part_idx = will_be_sorted ? INC_QUICKSORT_SORTED : r_left;
	index_t l_right = r_left + 1;

	unsigned go_left = (lo < r_left && r_left-l > 1);
	unsigned go_right = (l_right < hi && r-l_right > 1);
	index_t *part_idx_left = part_idx + 1;
	index_t *part_idx_right = part_idx + (l_right - l);
	go_left = (go_left && *part_idx_left != INC_QUICKSORT_SORTED);
	go_right = (go_right && *part_idx_right != INC_QUICKSORT_SORTED);

	switch fickle((go_left<<1) \| go_right) {
	case 0: { return; }
	case 1: right: { l = l_right, part_idx = part_idx_right; break; }
	case 2: left: { r = r_left, part_idx = part_idx_left; break; }
	case 3: {
	if (r_left-l <= r-l_right) {
	inc_quicksort_rand_rec(rng, arr, lo, hi, l, r_left, part_idx_left);
	goto right;
	} else {
	inc_quicksort_rand_rec(rng, arr, lo, hi, l_right, r, part_idx_right);
	goto left;
	}
	}
	default: { unreachable(); }
	}
	}
	}
	static inline void
	inc_quicksort_rand(item_t arr, index_t part_idxs, index_t len, index_t lo, index_t hi) {
	surely(len >= 0), surely(0 <= lo), surely(lo <= hi), surely(hi <= len);
	if (len <= 1 \|\| lo >= hi \|\| *part_idxs == INC_QUICKSORT_SORTED) { return; }

	index_rng_t rng = index_rng((uintptr_t)arr);
	index_t i = index_rand(&rng, len);
	rng = index_rng((uintptr_t)arr + load_u64(arr+i, (len-i)*sizeof(arr[0])));

	inc_quicksort_rand_rec(&rng, arr, lo, hi, 0, len, part_idxs);
	}


	//-----------------------------------------------------------------------------*
	// OPTION 3: Floyd-Rivest partitioning + N largest scan using binary heap
	//-----------------------------------------------------------------------------*

	// Binary heap implementation. Supports both min- and max-heaps, as well as
	// both left-to-right and right-to-left storage layout in memory.
	#define heap_superior(x, y) ((min_heap) ? item_less((x), (y)) : item_less((y), (x)))
	#define heap_at(i) (*((stored_backwards) ? (root)-(i) : (root)+(i)))
	static inline void
	heap_fix_down(int min_heap, int stored_backwards, item_t *root, index_t node, index_t len) {
	surely(len > 0), surely(0 <= node), surely(node < len);
	item_t x = heap_at(node);
	for (index_t child; child = (node<<1)\|1, child < len; node = child) {
	if likely(child+1 != len) {
	child += !!heap_superior(heap_at(child+1), heap_at(child));
	}
	if fickle(!heap_superior(heap_at(child), x)) { break; }
	heap_at(node) = heap_at(child);
	}
	heap_at(node) = x;
	}
	#undef heap_superior
	#undef heap_at
	static inline_within void
	heap_build(int min_heap, int stored_backwards, item_t *root, index_t len) {
	surely(len >= 0);
	for (index_t i = len/2; i != 0; i--) {
	heap_fix_down(min_heap, stored_backwards, root, i-1, len);
	}
	}
	#define lr_max_heap_fix_down(root, node, len) (heap_fix_down(0,0, (root), (node), (len)))
	#define rl_min_heap_fix_down(root, node, len) (heap_fix_down(1,1, (root), (node), (len)))
	#define lr_max_heap_build(root, len) (heap_build(0,0, (root), (len)))
	#define rl_min_heap_build(root, len) (heap_build(1,1, (root), (len)))

	// Use Floyd-Rivest Select algorithm to partition elements in [beg,end[ using
	// the pivot_rank-th largest element in that interval (counting from zero) for
	// the pivot, which hence ends up at exactly the pivot_rank index in the array
	// when partitioning completes.
	//
	// - Floyd, R. W., Rivest, R. L. (1975). Expected time bounds for selection.
	// Communications of the ACM, 18(3), 165-172.
	// - Kiwiel, K. C. (2005). On Floyd and Rivest's SELECT algorithm.
	// Theoretical Computer Science, 347(1-2), 214-238.
	static void
	rank_partition(item_t *arr, index_t beg, index_t end, index_t pivot_rank) {
	enum { CUTOFF_LEN = 600 }; // At most 4 levels of recursion for 32-bit ranges.
	surely(beg <= end), surely(beg <= pivot_rank), surely(pivot_rank < end);
	ptrdiff_t l = beg, r = end-1, k = pivot_rank;
	while (l < r) {
	ptrdiff_t n = r - l + 1;
	ptrdiff_t m = k - l + 1;
	if (n > CUTOFF_LEN) {
	double log_n = log(n);
	ptrdiff_t s = (ptrdiff_t)exp(log_n*(2/3.0)) / 2;
	ptrdiff_t g = (1 - 2(m < n/2)) (ptrdiff_t)sqrt(log_n(n-s)s/n) / 2;
	ptrdiff_t l_sample = k + g - m*s/n;
	ptrdiff_t r_sample = k + g + (n-m)*s/n;
	if unlikely(l_sample < l) { l_sample = l; }
	if unlikely(r_sample > r) { r_sample = r; }
	rank_partition(arr, l_sample, r_sample+1, k);
	}
	ptrdiff_t p = l + partition(arr+l, n, m-1);
	if fickle(k <= p) { r = p-1; }
	if fickle(p <= k) { l = p+1; }
	}
	}
	// Partially heap-sort elements from [0,len[ so that those whose ranks in
	// the sorted sequence belong to [lo,hi[ take their correct positions.
	//
	// If the requested sorting interval is closer to the beginning of the array:
	//
	// (A1) Partition the array around the (hi-1)-st largest element as a pivot.
	// (A2) Build a right-to-left min-heap out of elements in [lo,hi[ so that
	// the root node holding the minimum value is stored at index hi-1.
	// (A3) For each element in [0,lo[, if it is exceeding the current heap
	// minimum, exchange the former with the latter, fixing the heap.
	// (A4) Fill [lo,hi[ going left to right with successively removed heap
	// minimums, simultaneously shortening (and maintaining) the heap.
	//
	// Otherwise, if the interval is closer to the end of the array:
	//
	// (B1) Partition the array around the lo-th largest element as a pivot.
	// (B2) Build a left-to-right max-heap out of elements in [lo,hi[ so that
	// the root node holding the maximum value is stored at index lo.
	// (B3) For each element in [hi,len[, if it is inferior to the current heap
	// maximum, exchange the former with the latter, fixing the heap.
	// (B4) Fill [lo,hi[ going right to left with successively removed heap
	// maximums, simultaneously shortening (and maintaining) the heap.
	static inline void
	rank_partitioned_heapsort(item_t *arr, index_t len, index_t lo, index_t hi) {
	surely(len >= 0), surely(0 <= lo), surely(lo < hi), surely(hi <= len);
	if (len <= 1 \|\| lo >= hi) { return; }

	if (lo < len-hi) {
	rank_partition(arr, 0, len, hi-1);

	index_t heap_len = hi - lo;
	item_t *heap_root = arr + hi - 1;
	rl_min_heap_build(heap_root, heap_len);

	for (index_t i = 0; i != lo; i++) {
	if (!item_less(*heap_root, arr[i])) { continue; }
	swap(arr[i], *heap_root);
	rl_min_heap_fix_down(heap_root, 0, heap_len);
	}

	for (index_t i = lo; i != hi-1; i++) {
	swap(arr[i], *heap_root);
	rl_min_heap_fix_down(heap_root, 0, --heap_len);
	}
	} else {
	if likely(lo != 0) { rank_partition(arr, 0, len, lo); }

	index_t heap_len = hi - lo;
	item_t *heap_root = arr + lo;
	lr_max_heap_build(heap_root, heap_len);

	for (index_t i = hi; i != len; i++) {
	if (!item_less(arr[i], *heap_root)) { continue; }
	swap(arr[i], *heap_root);
	lr_max_heap_fix_down(heap_root, 0, heap_len);
	}

	for (index_t i = hi-1; i != lo; i--) {
	swap(arr[i], *heap_root);
	lr_max_heap_fix_down(heap_root, 0, --heap_len);
	}
	}
	}


	//-----------------------------------------------------------------------------*
	// BENCHMARKS
	//-----------------------------------------------------------------------------*

	#include <stdio.h> // printf()
	#include <stdlib.h> // calloc(), free()
	#include <time.h> // clock_gettime()

	static inline uint64_t
	nsec(void) {
	struct timespec t;
	clock_gettime(CLOCK_MONOTONIC_RAW, &t);
	return (uint64_t)t.tv_sec10001000*1000 + (uint64_t)t.tv_nsec;
	}

	static inline void
	shuffle(struct pcg32 r, item_t arr, uint32_t n) {
	for (uint32_t i = 0; i < n; i++) {
	swap(arr[i], arr[onto_range_u32(pcg32_next(r), i, n)]);
	}
	}

	static inline void
	call_partial_quicksort_rand(item_t arr, index_t part_idxs, index_t len, index_t lo, index_t hi) {
	(void)part_idxs;
	partial_quicksort_rand(arr, len, lo, hi);
	}
	static inline void
	call_rank_partitioned_heapsort(item_t arr, index_t part_idxs, index_t len, index_t lo, index_t hi) {
	(void)part_idxs;
	rank_partitioned_heapsort(arr, len, lo, hi);
	}

	extern int
	main(void) {
	enum {
	RANDOMIZE = 0,
	VERBOSE = 0,
	};
	typedef void partial_sort_test_func(item_t , index_t , index_t, index_t, index_t);
	static struct test_func {
	partial_sort_test_func *func;
	char const *name;
	} const funcs[] = {
	#define X(p,f) {glue(p,f), #f}
	X(call_,rank_partitioned_heapsort),
	X(call_,partial_quicksort_rand),
	X(,inc_quicksort_rand),
	#undef X
	}; enum { N_FUNCS = sizeof(funcs)/sizeof(funcs[0]) };
	static struct test_params {
	int n_runs;
	int n_sorts;
	index_t arr_len;
	index_t min_sort_len;
	index_t max_sort_len;
	} const tests[] = {
	{100, 5, 100, 5, 20},
	{7000, 1, 100*1000, 100, 1000},
	{500, 100, 100*1000, 100, 1000},
	{30, 500, 1000*1000, 100, 5000},
	{8, 2000, 1000*1000, 100, 5000},
	{4, 500, 1010001000, 100, 10*1000},
	{4, 25, 10010001000, 100, 10*1000},
	}; enum { N_TESTS = sizeof(tests)/sizeof(tests[0]) };

	index_t max_arr_len = 0;
	for (int i = 0; i < N_TESTS; i++) {
	if (max_arr_len < tests[i].arr_len) {
	max_arr_len = tests[i].arr_len;
	}
	}
	item_t *arr = calloc(max_arr_len, sizeof(arr[0]));
	index_t *part_idxs = calloc(max_arr_len, sizeof(part_idxs[0]));
	surely(arr != NULL), surely(part_idxs != NULL);
	if (!arr \|\| !part_idxs) { return 1; }

	uint64_t const rng_seed = RANDOMIZE ? nsec() : 809;

	for (int i_func = 0; i_func < N_FUNCS; i_func++) {
	partial_sort_test_func *const partial_sort = funcs[i_func].func;
	printf("\nFunction %s\n", funcs[i_func].name);

	struct pcg32 rng = pcg32(rng_seed);

	for (int i_test = 0; i_test < N_TESTS; i_test++) {
	struct test_params const T = tests[i_test];
	if (VERBOSE) {
	printf("Test %3d. Parameters: %d x %d, %td, %td..%td\n",
	i_test+1, T.n_runs, T.n_sorts, (ptrdiff_t)T.arr_len,
	(ptrdiff_t)T.min_sort_len, (ptrdiff_t)T.max_sort_len);
	}

	uint64_t test_dt = 0;
	for (int i_run = 0; i_run < T.n_runs; i_run++) {
	// Initialize the array to a random permutation (of its indices).
	for (index_t i = 0; i < T.arr_len; i++) { arr[i] = (item_t)i; }
	shuffle(&rng, arr, T.arr_len);
	// Set pivot indices to INC_QUICKSORT_CHAOS for inc_quicksort_rand():
	apriori(INC_QUICKSORT_CHAOS == (index_t)-1);
	memset(part_idxs, 0xFF, T.arr_len*sizeof(part_idxs[0]));

	uint64_t run_dt = 0;
	for (int i_sort = 0; i_sort < T.n_sorts; i_sort++) {
	index_t span = onto_range_u32(pcg32_next(&rng), T.min_sort_len, T.max_sort_len+1);
	index_t lo = onto_range_u32(pcg32_next(&rng), 0, T.arr_len-span+1);
	index_t hi = lo + span;

	uint64_t t0 = nsec();
	(*partial_sort)(arr, part_idxs, T.arr_len, lo, hi);
	uint64_t dt = nsec() - t0;
	run_dt += dt;

	int ok = 1; index_t first_diff = 0;
	for (index_t i = lo; i < hi; i++) {
	if (arr[i] != (item_t)i) {
	ok = 0, first_diff = i;
	break;
	}
	}
	if (VERBOSE \|\| !ok) {
	printf("Test %3d, run %3d, sort %3d. [%12td %12td] (%6td): %4s %.9f\n",
	i_test+1, i_run+1, i_sort+1, (ptrdiff_t)lo, (ptrdiff_t)hi-1, (ptrdiff_t)span,
	ok ? "OK" : "FAIL", dt*1e-9/span);
	}
	if (!ok) {
	enum { BEFORE = 5, AFTER = 10 };
	index_t l = first_diff-lo >= BEFORE ? first_diff-BEFORE : lo;
	index_t r = hi-first_diff >= AFTER+1 ? first_diff+AFTER+1 : hi;
	printf(" [%td..%td]:", (ptrdiff_t)l, (ptrdiff_t)r-1);
	for (index_t i = l; i < r; i++) {
	printf(" %llu", (unsigned long long)arr[i]);
	}
	printf("\n");
	}
	}
	test_dt += run_dt;

	if (VERBOSE) {
	printf("Test %3d, run %3d. Elapsed: %9.6f\n",
	i_test+1, i_run+1, run_dt*1e-9);
	}
	}

	printf("Test %3d. Elapsed: %7.3f total, %9.6f per run\n",
	i_test+1, test_dt1e-9, test_dt1e-9/T.n_runs);
	fflush(stdout);
	}
	}

	free(arr), arr = NULL;
	free(part_idxs), part_idxs = NULL;

	return 0;
	}
	ISC License

	Copyright 2024 Igor Burago

	Permission to use, copy, modify, and/or distribute this software for any
	purpose with or without fee is hereby granted, provided that the above
	copyright notice and this permission notice appear in all copies.

	THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
	ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.