Skip to content

Instantly share code, notes, and snippets.

@DigitalRedPanda
Last active January 19, 2026 13:04
Show Gist options
  • Select an option

  • Save DigitalRedPanda/ef23322ada0fe8abd40907e21a0bc194 to your computer and use it in GitHub Desktop.

Select an option

Save DigitalRedPanda/ef23322ada0fe8abd40907e21a0bc194 to your computer and use it in GitHub Desktop.
/*
 * Build-mode switch for the helper functions in this file.
 *
 * DEBUG is defined unconditionally on the next line, so as written the
 * #else branch is always taken and STATIC_INLINE expands to nothing: the
 * helpers are emitted as ordinary functions with external linkage.
 * Removing the DEBUG definition makes STATIC_INLINE expand to
 * `static inline` instead.
 */
#define DEBUG
#ifndef DEBUG
#define STATIC_INLINE static inline
#else
#define STATIC_INLINE
#endif
/*
 * Length-tracked byte string.
 *
 * The constructors in this file (string_new / string_empty) allocate both
 * the struct and its buffer on the heap; release both with string_free.
 */
typedef struct {
    uint64_t length;    /* content bytes; the constructors set this to `len` */
    uint64_t capacity;  /* the constructors set this to `len + 1`            */
    uint8_t *data;      /* byte buffer; the constructors allocate `len + 1`  */
} string;
/*
 * Creates a heap-allocated string holding a copy of the first `len` bytes of
 * `src`, followed by a NUL terminator.
 *
 * src: bytes to copy; only read when len > 0.
 * len: number of bytes to copy from src.
 *
 * Returns the new string (length = len, capacity = len + 1), or NULL when an
 * allocation fails or `len + 1` does not fit in size_t. The caller owns the
 * result and releases it with string_free.
 */
STATIC_INLINE string *string_new(const char *src, size_t len) {
    /* len + 1 would wrap to 0 and under-allocate the buffer. */
    if (len == SIZE_MAX) {
        return NULL;
    }

    string *s = malloc(sizeof *s);
    if (s == NULL) {
        return NULL;
    }

    s->data = malloc(len + 1);
    if (s->data == NULL) {
        free(s); /* do not leak the header when the buffer allocation fails */
        return NULL;
    }

    /* memcpy with a NULL source is undefined even for zero bytes. */
    if (len > 0) {
        memcpy(s->data, src, len);
    }
    s->data[len] = '\0';
    s->length = len;
    s->capacity = len + 1;
    return s;
}
/*
 * Creates a heap-allocated string of `len` zero bytes plus a NUL terminator.
 *
 * len: number of content bytes; the buffer is len + 1 bytes, all zero.
 *
 * Returns the new string (length = len, capacity = len + 1), or NULL when an
 * allocation fails or `len + 1` does not fit in size_t. The caller owns the
 * result and releases it with string_free.
 */
STATIC_INLINE string *string_empty(size_t len) {
    /* len + 1 would wrap to 0 and under-allocate the buffer. */
    if (len == SIZE_MAX) {
        return NULL;
    }

    string *s = malloc(sizeof *s);
    if (s == NULL) {
        return NULL;
    }

    s->data = calloc(len + 1, 1);
    if (s->data == NULL) {
        free(s); /* do not leak the header when the buffer allocation fails */
        return NULL;
    }

    s->length = len;
    s->capacity = len + 1;
    return s;
}
/*
 * Releases a string's byte buffer and then the string object itself.
 * Passing NULL is allowed and does nothing.
 */
STATIC_INLINE void string_free(string *a) {
    /* The guard is required: a->data is dereferenced below. */
    if (a == NULL) {
        return;
    }
    free(a->data);
    free(a);
}
/*
 * Loads the first bytes of `str` into the 128-bit register `*reg`.
 *
 * At most 16 bytes are copied; when the string is shorter than 16 bytes the
 * remaining lanes of the register are zero.
 */
STATIC_INLINE void load_str_m128i_str(const string *str, __m128i *reg) {
    /* Zero-filled staging buffer supplies the padding for short strings. */
    char staging[16] __attribute__((aligned(16))) = {0};

    size_t copy_len = str->length;
    if (copy_len > 16) {
        copy_len = 16;
    }

    memcpy(staging, str->data, copy_len);
    *reg = _mm_loadu_si128((__m128i*) staging);
}
/*
 * Loads up to 16 bytes of `str`, starting at byte index `offset`, into the
 * 128-bit register `*reg`.
 *
 * Lanes past the end of the string are zero. When `offset` is at or beyond
 * str->length nothing is read from the string and the register is all zero.
 */
STATIC_INLINE void load_str_m128i_str_offset(const string *str, size_t offset, __m128i *reg) {
    /* Zero-filled staging buffer supplies the padding for short tails. */
    char padded_string[16] __attribute__((aligned(16))) = {0};

    /*
     * Compare before subtracting: length - offset is unsigned, so an offset
     * past the end would wrap to a huge value, be clamped to 16, and read
     * 16 bytes from outside the buffer.
     */
    if (offset < str->length) {
        size_t available = str->length - offset;
        size_t copy_len = available > 16 ? 16 : available;
        memcpy(padded_string, str->data + offset, copy_len);
    }

    *reg = _mm_loadu_si128((__m128i*) padded_string);
}
/*
 * Reports whether the bytes of `b` occur contiguously somewhere in `a`.
 *
 * a: haystack; must be non-NULL.
 * b: needle; must be non-NULL. An empty needle is contained in every string.
 *
 * Returns 1 when found, 0 otherwise.
 *
 * Strategy: a 16-lane SSE2 compare finds positions whose byte equals the
 * needle's first byte, and memcmp confirms each candidate over the full
 * needle length. This stays exact for needles of any length. A single
 * EQUAL_ORDERED string compare against a 16-byte register cannot do that:
 * it also flags partial matches at the end of the register and cannot see
 * more than 16 needle bytes.
 */
int contains_str(string *a, string *b) {
    size_t hay_len = a->length;
    size_t needle_len = b->length;

    if (needle_len == 0) {
        return 1;
    }
    /* Also keeps hay_len - needle_len below from wrapping around. */
    if (needle_len > hay_len) {
        return 0;
    }

    const uint8_t *hay = a->data;
    const uint8_t *needle = b->data;
    size_t last_start = hay_len - needle_len; /* last index a match can begin at */
    __m128i first_byte = _mm_set1_epi8((char) needle[0]);

    size_t i = 0;

    /* Vector phase: runs only while a full 16-byte load stays inside hay. */
    while (i <= last_start && hay_len - i >= 16) {
        __m128i chunk = _mm_loadu_si128((const __m128i *) (hay + i));
        /* Bit k of the mask is set when hay[i + k] equals needle[0]. */
        unsigned candidates =
            (unsigned) _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, first_byte));

        for (size_t lane = 0; candidates != 0; lane++, candidates >>= 1) {
            size_t start = i + lane;
            if (start > last_start) {
                break; /* the needle would run past the end of hay */
            }
            if ((candidates & 1u) &&
                memcmp(hay + start, needle, needle_len) == 0) {
                return 1;
            }
        }
        i += 16;
    }

    /* Scalar phase: the final start positions, fewer than 16 bytes from the end. */
    for (; i <= last_start; i++) {
        if (hay[i] == needle[0] &&
            memcmp(hay + i, needle, needle_len) == 0) {
            return 1;
        }
    }
    return 0;
}
STATIC_INLINE void lowercase(string *a) {
int a_len = a->length;
uint8_t* a_array = a->data;
__m128i A_mask = _mm_set1_epi8('A');
__m128i thirty_two = _mm_set1_epi8(32);
__m128i twenty_six_mask = _mm_set1_epi8(26);
__m128i neg_1 = _mm_set1_epi8(-1);
size_t blocks = a->length / 16;
size_t remainder = a->length % 16;
size_t total_blocks = blocks * 16;
for (int i = 0; i < total_blocks; i += 0x10) {
__m128i *data = (__m128i*) (a_array + i);
// | 65 | 78 | 97 | (BNa)
__m128i slice = _mm_loadu_si128(data);
// | 65 - 64 = 1 | 78 - 64 = 14 | 97 - 64 = 33 | (BNa)
__m128i mask_sub = _mm_sub_epi8(slice,A_mask);
// | 1 < 26 = 1 | 14 < 26 = 1 | 33 < 26 = 0 |
__m128i mask2 = _mm_cmplt_epi8(mask_sub, twenty_six_mask);
// | 1 > -1 = 1 | 14 > -1 = 1 | 33 > -1 = 1 |
__m128i mask3 = _mm_cmpgt_epi8(mask_sub, neg_1);
// | 1 & 1 = 1 | 1 & 1 = 1 | 1 & 0 = 0 |
__m128i mask4 = _mm_and_si128(mask2, mask3);
// | 11111111 & 00010000 = 32 | 11111111 & 00010000 = 32 | 11111111 & 00000000 = 0 | ; 11111111(1 in m128 registers)
__m128i mask_final = _mm_and_si128(mask4, thirty_two);
// | 65 | 32 = 97 | 78 | 32 = 110 | 97 | 0 = 97 | 'A'(65) | 32 (00100001) = 97 (00110001)
__m128i result = _mm_or_si128(slice, mask_final);
_mm_storeu_si128(data, result);
// uint8_t cur_char = a_array[i];
// a_array[i] = cur_char - 'A' < 26 ? cur_char | 0x20 : cur_char;
}
if(remainder > 0) {
__m128i reg_remnant;
load_str_m128i_str_offset(a, total_blocks, &reg_remenant);
// | 65 - 64 = 1 | 78 - 64 = 14 | 97 - 64 = 33 | (BNa)
__m128i mask_sub = _mm_sub_epi8(reg_remenant,A_mask);
// | 1 < 26 = 1 | 14 < 26 = 1 | 33 < 26 = 0 |
__m128i mask2 = _mm_cmplt_epi8(mask_sub, twenty_six_mask);
// | 1 > -1 = 1 | 14 > -1 = 1 | 33 > -1 = 1 |
__m128i mask3 = _mm_cmpgt_epi8(mask_sub, neg_1);
// | 1 & 1 = 1 | 1 & 1 = 1 | 1 & 0 = 0 |
__m128i mask4 = _mm_and_si128(mask2, mask3);
// | 11111111 & 00010000 = 32 | 11111111 & 00100000 = 32 | 11111111 & 00000000 = 0 | ; 11111111(1 in m128 registers)
__m128i mask_final = _mm_and_si128(mask4, thirty_two);
// | 65 | 32 = 97 | 78 | 32 = 110 | 97 | 0 = 97 | 'A'(65) | 32 (00100001) = 97 (00110001)
__m128i result = _mm_or_si128(reg_remenant, mask_final);
union {
__m128i vec;
uint8_t bytes[16];
} u;
u.vec = result;
memcpy(a_array + total_blocks, u.bytes, remainder);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment