Skip to content

Instantly share code, notes, and snippets.

@vurtun
Last active May 23, 2024 18:34
Show Gist options
  • Save vurtun/7063afddcf1491af16037a207a167e49 to your computer and use it in GitHub Desktop.
Save vurtun/7063afddcf1491af16037a207a167e49 to your computer and use it in GitHub Desktop.
print K largest(or smallest) elements in an array
/* Problem: Directory file view ui. Files can be sorted by different properties (name, size, ...). Ui itself only needs elements
that are currently in the visible section of the list view from x and count k. So I want to use a fixed size buffer with up to k elements
and walk through all files in the directory and only have the final elements in the end in the fixed size buffer.
Temp buffers are fine for me as long as they have a fixed at compile time size. For those familiar with SQL basically this is
a SORT BY LIMIT begin, count.
Idea: Use Floyd–Rivest algorithm with average O(n) to find the element at index x. Walk over list again and skip all elements smaller than x
then use heap to sort for the k smallest elements afterwards. So we have for Floyd–Rivest and heap combined O(n*log(k))
Problem: Find x is not directly possible since not all elements are in memory because we have only fixed size buffer.
Idea: Create a buffer of size 2𝑘. Read in 2𝑘 elements. Use Floyd–Rivest algorithm to partition the buffer
so that the 𝑘 smallest elements are first; this takes 𝑂(𝑘) time. Now read in another 𝑘 items from your array into the
buffer, replacing the 𝑘 largest items in the buffer, partition the buffer as before, and repeat.
Problem: k should be fixed size. So when we have k=256 we are not able to address any index greater than 256.
Idea: Multiple iterations in multiple of fixed k. So when k=256 then have index 256 initally and iterate over all n. Then
new iteration we skip all elements smaller than the element at k=256 and use a new index. Repeat until we hit the index
we want.
Problem: Kind of expensive. We get O(2*k * (n*0.5)/k) + k*log(k)
Optimizations:
- initially on index 0 or end page use heap with heapify up/down to sort first k elements directly
- Instead of k being size of visible section use bigger window and only recalculate when reaching the end
- Jumps don't happen to often (shortcuts + shift-scrollbar-click) so reusing the elements for the
starting/ending element makes sense
- File count smaller than k only neeed O(k*log(k))
- start/end indicies are limited to n/2 by walking backwards
*/
// https://cs.stackexchange.com/questions/68125/finding-kth-smallest-element-from-a-given-sequence-only-with-ok-memory-on-t
// https://en.wikipedia.org/wiki/Floyd%E2%80%93Rivest_algorithm
// https://gist.github.com/igorburago/fa340429e75007a41335fb660d419e52
#include <math.h>
#define heap_parent(i) (((i)-1) >> 1)
#define heap_right(i) (((i)+1) << 1)
#define heap_left(i) (heap_right(i)-1)
#define sgn(val) ((0 < val) - (val < 0))
#define max(x,y) ((x<y)?y:x)
#define min(x,y) ((x<y)?x:y)
static inline void
swpi(void *a, void *b) {
int t = *(int*)a;
*(int*)a = *(int*)b;
*(int*)b = t;
}
static inline int
cmpid(const void *a, const void *b) {
return *(const int*)b - *(const int*)a;
}
static inline int
cmpiu(const void *a, const void *b) {
return *(const int*)a - *(const int*)b;
}
static inline void
heapifyd(int *a, int i, int n,
int(*cmp)(const void*, const void*),
void(*swp)(void *a, void *b)) {
while (n) {
int lo, l = heap_left(i);
int r = heap_right(i);
lo = (l < n && cmp(&a[l], &a[i]) < 0) ? l: i;
lo = (r < n && cmp(&a[r], &a[lo]) < 0) ? r: lo;
if (lo == i) {
return;
}
swp(&a[i], &a[lo]);
i = lo;
}
}
static inline void
heapifyu(int *a, int i,
int(*cmp)(const void*, const void*),
void(*swp)(void *a, void *b)) {
while (i && cmp(&a[heap_parent(i)], &a[i]) > 0) {
int p = heap_parent(i);
swp(&a[p], &a[i]);
i = p;
}
}
static int
partition(int a[], int l, int r, int k,
int(*cmp)(const void*, const void*),
void(*swp)(void *a, void *b)) {
while (r > l) {
if (r - l > 600) {
int n = r - l + 1;
int i = k - l + 1;
float z = logf(n);
float s = 0.5 * expf(2 * z / 3);
float sd = 0.5 * sqrtf(z * s * (n - s) / n) * sgn(i - n / 2);
int l2 = max(l, (int)(k - i * s / n + sd));
int r2 = min(r, (int)(k + (n - i) * s / n + sd));
partition(a, l2, r2, k, cmp, swp);
}
int t = a[k];
int i = l;
int j = r;
swp(&a[l], &a[k]);
if (cmp(&a[r],&t) > 0) {
swp(&a[l], &a[r]);
}
while (i < j) {
swp(&a[i], &a[j]);
i++, j--;
for (;cmp(&a[i],&t)<0; ++i);
for (;cmp(&a[j],&t)>0; --j);
}
if (cmp(&a[l],&t) == 0) {
swp(&a[l], &a[j]);
} else {
swp(&a[r], &a[++j]);
}
if (j <= k) {
l = j + 1;
}
if (k <= j) {
r = j - 1;
}
}
return a[k];
}
#include <stdio.h>
#include <stdlib.h>
extern int
main(void) {
int a[] = {1000, 400, 300, -10, 20, -50, 65, 47, 45, 50, 40, 75, 60, 100};
int n = sizeof(a) / sizeof(a[0]);
int off = 3;
int k = 3;
int i = 0;
partition(a, 0, n-1, off, cmpiu, swpi);
for (int j = 0; j < n; ++j) {
printf("%d,", a[j]);
}
printf("\n");
int lst[k], cnt = 0;
for (i = off; i < off + k; ++i) {
lst[cnt++] = a[i];
heapifyu(lst, cnt-1, cmpid, swpi);
}
for (; i < n; ++i) {
if (a[i] < lst[0]) {
lst[0] = a[i];
heapifyd(lst, 0, cnt, cmpid, swpi);
}
}
int res[k], num = 0;
while (cnt) {
res[k - ++num] = lst[0];
lst[0] = lst[--cnt];
heapifyd(lst, 0, cnt, cmpid, swpi);
}
printf("Smallest values: [");
for (int i = 0; i < num; ++i) {
printf("%d,", res[i]);
}
printf("]\n");
return 0;
}
@igorburago
Copy link

igorburago commented May 23, 2024

@vurtun, unless the array in question is queried for subinterval sorting only a couple of times for the same content or is very close to being sorted from the get-go, I would use partial quicksort instead — especially since it can easily be made incremental by reusing the partitioning work from previous calls on the array. This is, of course, assuming we can afford additional linear space in the form of a scratch buffer for storing pivot indices of completed partitions in the quicksort tree.

For a large number of repetitive calls (even with consecutively nonintersecting intervals), this latter approach is generally more performant than K-largest scans with a heap, especially when the target subinterval is far from either end of the array.

Here is my implementation of each of the algorithms together with some benchmarks for comparison:
https://gist.github.com/igorburago/fa340429e75007a41335fb660d419e52.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment