Skip to content

Instantly share code, notes, and snippets.

@vurtun
Last active February 5, 2025 09:50
Show Gist options
  • Save vurtun/fa49e27e2408a365822f6aaa3b4f3a03 to your computer and use it in GitHub Desktop.
Save vurtun/fa49e27e2408a365822f6aaa3b4f3a03 to your computer and use it in GitHub Desktop.
#include <immintrin.h> // For AVX2
#define MAX_SIZE (64*1024)
#define BITS_PER_WORD 64
#define WORD_CNT ((MAX_SIZE + BITS_PER_WORD - 1) / BITS_PER_WORD) // 1024 words
#define BIT_MAP_CNT ((WORD_CNT + BITS_PER_WORD - 1) / BITS_PER_WORD) // 16 words
#define WORD_SHIFT 6 // log2(BITS_PER_WORD)
#define WORD_MSK (BITS_PER_WORD - 1) // 63
#define CHUNK_CNT (BIT_MAP_CNT/4)
#define CACHE_LINE_SIZE 64
static struct my_data data[MAX_SIZE];
static unsigned long long data_used[WORD_CNT] __attribute__((aligned(CACHE_LINE_SIZE)));
static unsigned long long data_used_top[BIT_MAP_CNT] __attribute__((aligned(CACHE_LINE_SIZE)));
static inline int
bit_ffz() {
const __m256i* ptr = (__m256i*)data_used_top;
int chnk_idx = 0;
unsigned long long msk = 0;
while (chnk_idx < CHUNK_CNT &&
!(msk = ~_mm256_movemask_pd(_mm256_castsi256_pd(ptr[chnk_idx])))) {
chnk_idx++;
}
if (chnk_idx == CHUNK_CNT) {
return -1; // All bits are set
}
int qidx = __builtin_ctzll(msk);
int grp_idx = (chnk_idx << 2) + qidx;
unsigned long long word = ~data_used_top[grp_idx];
int widx = (grp_idx << 6) | __builtin_ctzll(word);
unsigned long long target_word = ~data_used[widx];
return (widx << 6) | __builtin_ctzll(target_word);
}
static inline void
bit_set(int idx) {
unsigned widx = idx >> WORD_SHIFT; // Divide by 64
unsigned bidx = idx & WORD_MSK; // Modulo 64
unsigned long long *word_ptr = &data_used[widx];
unsigned long long msk = 1ULL << bidx;
unsigned long long old_value = *word_ptr;
unsigned long long new_value = old_value | msk;
*word_ptr = new_value;
unsigned top_idx = widx >> WORD_SHIFT; // Divide by 64 again
unsigned top_bit = widx & WORD_MSK; // Modulo 64 again
unsigned long long top_msk = 1ULL << top_bit;
unsigned long long is_full = -(new_value == ~0ULL);
data_used_top[top_idx] ^= ((data_used_top[top_idx] ^ is_full) & top_msk);
}
static inline void
bit_clr(int idx) {
unsigned int widx = idx >> WORD_SHIFT;
unsigned long long msk = ~(1ULL << (idx & WORD_MSK));
unsigned long long* word_ptr = &data_used[widx];
unsigned long long new_value = *word_ptr & msk;
*word_ptr = new_value;
unsigned int top_idx = widx >> 6;
unsigned long long top_msk = 1ULL << (widx & WORD_MSK);
unsigned long long is_not_empty = -(new_value != 0);
unsigned long long* top_ptr = &data_used_top[top_idx];
*top_ptr ^= (*top_ptr ^ is_not_empty) & top_msk;
}
static inline int
bit_is_set(int idx) {
unsigned widx = idx >> WORD_SHIFT; // Divide by 64
unsigned bidx = idx & WORD_MSK; // Modulo 64
return !!(data_used[widx] & (1 << bidx));
}
#if 0
find_first_zero_bit:
mov rax, qword ptr [rip + top_level_bitmap@GOTPCREL]
vmovapd ymm0, ymmword ptr [rax]
vmovmskpd ecx, ymm0
not rcx
rep bsf rcx, rcx
mov rax, qword ptr [rax + 8*rcx]
not rax
shl ecx, 6
rep bsf rdx, rax
or edx, ecx
mov rax, qword ptr [rip + bit_array@GOTPCREL]
mov rax, qword ptr [rax + 8*rdx]
not rax
shl edx, 6
rep bsf rax, rax
or eax, edx
vzeroupper
ret
set_bit:
mov ecx, edi
shr ecx, 6
mov rax, qword ptr [rip + bit_array@GOTPCREL]
mov rdx, qword ptr [rax + 8*rcx]
bts rdx, rdi
mov esi, 1
shl rsi, cl
mov qword ptr [rax + 8*rcx], rdx
xor eax, eax
cmp rdx, -1
sete al
neg rax
shr edi, 12
mov rcx, qword ptr [rip + top_level_bitmap@GOTPCREL]
mov rdx, qword ptr [rcx + 8*rdi]
xor rax, rdx
and rax, rsi
xor rax, rdx
mov qword ptr [rcx + 8*rdi], rax
ret
clear_bit:
mov eax, edi
mov rdx, -2
mov ecx, edi
rol rdx, cl
shr eax, 6
mov rsi, qword ptr [rip + bit_array@GOTPCREL]
mov r8d, 1
mov ecx, eax
shl r8, cl
xor ecx, ecx
and qword ptr [rsi + 8*rax], rdx
setne cl
neg rcx
shr edi, 12
mov rax, qword ptr [rip + top_level_bitmap@GOTPCREL]
mov rdx, qword ptr [rax + 8*rdi]
xor rcx, rdx
and rcx, r8
xor rcx, rdx
mov qword ptr [rax + 8*rdi], rcx
ret
#endif
// init
/* nothing to do here */
// insert store index elsewhere (if neccessary)
int idx = bit_ffz();
bit_set(data_used, idx);
data[idx] = input;
// lookup using stored index
output = data[idx];
// update existing stored index
data[idx] = input;
// remove existing stored element
bit_clr(data_used, idx);
// branchless iteration over all elements
for (int i = 0; i < MAX_SIZE; ++i) {
var = bit_is_set(data_used,i) ? ... : ...;
}
#define MAX_SIZE (64*1024)
static struct my_data data[MAX_SIZE];
static unsigned short free_idx[MAX_SIZE];
static unsigned short free_idx_cnt = MAX_SIZE;
// init; populate indicies in reverse
for (int i = 0; i < MAX_SIZE; ++i) {
free_idx[i] = MAX_SIZE - i - 1;
}
// insert; store index elsewhere
index = free_idx[--free_idx_cnt];
data[index] = ...input;
// lookup using stored index
output = data[idx];
// update existing stored element
data[idx] = input;
// remove used stored element
data[idx].active = 0;
free_idx[free_idx_cnt++] = idx;
// branchless iterate over all elements
for (int i = 0; i < MAX_SIZE; ++i) {
var = data[i].active ? ... : ...;
}
@vurtun
Copy link
Author

vurtun commented Feb 4, 2025

image
Gi6mOKJXoAAaRtP
Gi6mOKIXwAA-57F
Gi6mOKIXYAA4c8u

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment