Last active
May 23, 2024 18:34
-
-
Save vurtun/7063afddcf1491af16037a207a167e49 to your computer and use it in GitHub Desktop.
print K largest(or smallest) elements in an array
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Problem: Directory file view ui. Files can be sorted by different properties (name, size, ...). Ui itself only needs elements | |
that are currently in the visible section of the list view from x and count k. So I want to use a fixed size buffer with up to k elements | |
and walk through all files in the directory and only have the final elements in the end in the fixed size buffer. | |
Temp buffers are fine for me as long as they have a fixed at compile time size. For those familiar with SQL basically this is | |
a SORT BY LIMIT begin, count. | |
Idea: Use Floyd–Rivest algorithm with average O(n) to find the element at index x. Walk over list again and skip all elements smaller than x | |
then use heap to sort for the k smallest elements afterwards. So we have for Floyd–Rivest and heap combined O(n*log(k)) | |
Problem: Find x is not directly possible since not all elements are in memory because we have only fixed size buffer. | |
Idea: Create a buffer of size 2𝑘. Read in 2𝑘 elements. Use Floyd–Rivest algorithm to partition the buffer | |
so that the 𝑘 smallest elements are first; this takes 𝑂(𝑘) time. Now read in another 𝑘 items from your array into the | |
buffer, replacing the 𝑘 largest items in the buffer, partition the buffer as before, and repeat. | |
Problem: k should be fixed size. So when we have k=256 we are not able to address any index greater than 256. | |
Idea: Multiple iterations in multiples of fixed k. So when k=256 then have index 256 initially and iterate over all n. Then
new iteration we skip all elements smaller than the element at k=256 and use a new index. Repeat until we hit the index | |
we want. | |
Problem: Kind of expensive. We get O(2*k * (n*0.5)/k) + k*log(k) | |
Optimizations: | |
- initially on index 0 or end page use heap with heapify up/down to sort first k elements directly | |
- Instead of k being size of visible section use bigger window and only recalculate when reaching the end | |
- Jumps don't happen too often (shortcuts + shift-scrollbar-click) so reusing the elements for the
starting/ending element makes sense | |
- File count smaller than k only needs O(k*log(k))
- start/end indices are limited to n/2 by walking backwards
*/ | |
// https://cs.stackexchange.com/questions/68125/finding-kth-smallest-element-from-a-given-sequence-only-with-ok-memory-on-t | |
// https://en.wikipedia.org/wiki/Floyd%E2%80%93Rivest_algorithm | |
// https://gist.github.com/igorburago/fa340429e75007a41335fb660d419e52 | |
#include <math.h> | |
/* Binary-heap index helpers for a 0-based array heap. */
#define heap_parent(i) (((i)-1) >> 1)
#define heap_right(i) (((i)+1) << 1)
#define heap_left(i) (heap_right(i)-1)
/* Sign of val: -1, 0, or +1.  Argument parenthesized so expressions such
 * as sgn(i - n / 2) expand correctly. */
#define sgn(val) ((0 < (val)) - ((val) < 0))
/* NOTE: max/min evaluate their arguments twice -- do not pass expressions
 * with side effects (e.g. max(i++, j)). */
#define max(x,y) (((x)<(y))?(y):(x))
#define min(x,y) (((x)<(y))?(x):(y))
/* Exchange the two ints that a and b point to. */
static inline void
swpi(void *a, void *b) {
    int *pa = a;
    int *pb = b;
    int tmp = *pa;
    *pa = *pb;
    *pb = tmp;
}
/* Descending int comparator (qsort-style): negative when *a should sort
 * before *b, i.e. when *a > *b.  Uses comparisons instead of the original
 * `b - a` subtraction, which overflows (undefined behavior) for extreme
 * values such as INT_MIN vs INT_MAX. */
static inline int
cmpid(const void *a, const void *b) {
    int x = *(const int*)a;
    int y = *(const int*)b;
    return (y > x) - (y < x);
}
/* Ascending int comparator (qsort-style): negative when *a < *b.  Uses
 * comparisons instead of the original `a - b` subtraction, which overflows
 * (undefined behavior) for extreme values such as INT_MIN vs INT_MAX. */
static inline int
cmpiu(const void *a, const void *b) {
    int x = *(const int*)a;
    int y = *(const int*)b;
    return (x > y) - (x < y);
}
/* Sift the element at index i down through the n-element heap `a` until
 * the heap property holds again (per cmp, the parent orders before both
 * children).  No-op when n is 0. */
static inline void
heapifyd(int *a, int i, int n,
    int(*cmp)(const void*, const void*),
    void(*swp)(void *a, void *b)) {
    if (!n) {
        return;
    }
    for (;;) {
        int left = 2 * i + 1;
        int right = 2 * i + 2;
        int best = i;
        if (left < n && cmp(&a[left], &a[best]) < 0) {
            best = left;
        }
        if (right < n && cmp(&a[right], &a[best]) < 0) {
            best = right;
        }
        if (best == i) {
            return;
        }
        swp(&a[i], &a[best]);
        i = best;
    }
}
/* Sift the element at index i up toward the root until its parent orders
 * before it (per cmp). */
static inline void
heapifyu(int *a, int i,
    int(*cmp)(const void*, const void*),
    void(*swp)(void *a, void *b)) {
    while (i != 0) {
        int parent = (i - 1) / 2;
        if (cmp(&a[parent], &a[i]) <= 0) {
            break;
        }
        swp(&a[parent], &a[i]);
        i = parent;
    }
}
/* Floyd-Rivest selection: rearranges a[l..r] in place so that a[k] holds
 * the value it would have if a[l..r] were fully sorted (per cmp); every
 * element before index k orders no later than a[k] and every element after
 * orders no earlier.  Returns a[k].  Expected cost is O(n) comparisons.
 * NOTE(review): cmp/swp are assumed to operate on int elements of `a`; the
 * recursive call below is used only for its side effect on `a`. */
static int
partition(int a[], int l, int r, int k,
    int(*cmp)(const void*, const void*),
    void(*swp)(void *a, void *b)) {
    while (r > l) {
        /* For large ranges, first recurse on a sample-derived subrange
         * [l2, r2] that very likely contains the k-th element -- the
         * Floyd-Rivest trick that keeps the expected work linear. */
        if (r - l > 600) {
            int n = r - l + 1;
            int i = k - l + 1;
            float z = logf(n);
            float s = 0.5 * expf(2 * z / 3);  /* sample size ~ n^(2/3) */
            /* Half-width of the sample window, pushed toward the side
             * of the range that k sits in. */
            float sd = 0.5 * sqrtf(z * s * (n - s) / n) * sgn(i - n / 2);
            int l2 = max(l, (int)(k - i * s / n + sd));
            int r2 = min(r, (int)(k + (n - i) * s / n + sd));
            partition(a, l2, r2, k, cmp, swp);
        }
        /* Hoare-style partition of a[l..r] around pivot value t = a[k]. */
        int t = a[k];
        int i = l;
        int j = r;
        swp(&a[l], &a[k]);
        /* Ensure the pivot copy sits at whichever end keeps the scans
         * below inside the range. */
        if (cmp(&a[r],&t) > 0) {
            swp(&a[l], &a[r]);
        }
        while (i < j) {
            swp(&a[i], &a[j]);
            i++, j--;
            for (;cmp(&a[i],&t)<0; ++i);
            for (;cmp(&a[j],&t)>0; --j);
        }
        /* Move the pivot into its final slot j. */
        if (cmp(&a[l],&t) == 0) {
            swp(&a[l], &a[j]);
        } else {
            swp(&a[r], &a[++j]);
        }
        /* Shrink the window to the side that still contains index k;
         * when j == k both bounds close in and the loop terminates. */
        if (j <= k) {
            l = j + 1;
        }
        if (k <= j) {
            r = j - 1;
        }
    }
    return a[k];
}
#include <stdio.h> | |
#include <stdlib.h> | |
extern int | |
main(void) { | |
int a[] = {1000, 400, 300, -10, 20, -50, 65, 47, 45, 50, 40, 75, 60, 100}; | |
int n = sizeof(a) / sizeof(a[0]); | |
int off = 3; | |
int k = 3; | |
int i = 0; | |
partition(a, 0, n-1, off, cmpiu, swpi); | |
for (int j = 0; j < n; ++j) { | |
printf("%d,", a[j]); | |
} | |
printf("\n"); | |
int lst[k], cnt = 0; | |
for (i = off; i < off + k; ++i) { | |
lst[cnt++] = a[i]; | |
heapifyu(lst, cnt-1, cmpid, swpi); | |
} | |
for (; i < n; ++i) { | |
if (a[i] < lst[0]) { | |
lst[0] = a[i]; | |
heapifyd(lst, 0, cnt, cmpid, swpi); | |
} | |
} | |
int res[k], num = 0; | |
while (cnt) { | |
res[k - ++num] = lst[0]; | |
lst[0] = lst[--cnt]; | |
heapifyd(lst, 0, cnt, cmpid, swpi); | |
} | |
printf("Smallest values: ["); | |
for (int i = 0; i < num; ++i) { | |
printf("%d,", res[i]); | |
} | |
printf("]\n"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@vurtun, unless the array in question is queried for subinterval sorting only a couple of times for the same content or is very close to being sorted from the get-go, I would use partial quicksort instead — especially since it can easily be made incremental by reusing the partitioning work from previous calls on the array. This is, of course, assuming we can afford additional linear space in the form of a scratch buffer for storing pivot indices of completed partitions in the quicksort tree.
For a large number of repetitive calls (even with consecutively nonintersecting intervals), this latter approach is generally more performant than K-largest scans with a heap, especially when the target subinterval is far from either end of the array.
Here is my implementation of each of the algorithms together with some benchmarks for comparison:
https://gist.github.com/igorburago/fa340429e75007a41335fb660d419e52.