Last active
January 19, 2026 13:04
-
-
Save DigitalRedPanda/ef23322ada0fe8abd40907e21a0bc194 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Build-mode switch. DEBUG is defined unconditionally here, so STATIC_INLINE
 * currently expands to nothing and the helpers declared with it are ordinary
 * functions. Remove the DEBUG definition to make them `static inline`.
 */
#define DEBUG

#ifdef DEBUG
#define STATIC_INLINE
#else
#define STATIC_INLINE static inline
#endif
/*
 * Length-prefixed byte string.
 * The constructors in this file set capacity to length + 1 and keep a
 * trailing zero byte after the payload.
 */
typedef struct {
    uint64_t length;   /* number of payload bytes */
    uint64_t capacity; /* size in bytes of the buffer behind `data` */
    uint8_t *data;     /* heap buffer holding the bytes */
} string;
| STATIC_INLINE string *string_new(const char *src, size_t len) { | |
| string *s = malloc(sizeof(string)); | |
| s->length = len; | |
| s->capacity = len + 1; | |
| s->data = malloc(len + 1); | |
| memcpy(s->data, src, len); | |
| s->data[len] = '\0'; | |
| return s; | |
| } | |
| STATIC_INLINE string *string_empty(size_t len) { | |
| string *s = malloc(sizeof(string)); | |
| s->length = len; | |
| s->capacity = len + 1; | |
| s->data = calloc(len + 1, 1); | |
| return s; | |
| } | |
| STATIC_INLINE void string_free(string *a) { | |
| if(a){ | |
| free(a->data); | |
| free(a); | |
| } | |
| } | |
| STATIC_INLINE void load_str_m128i_str(const string *str, __m128i *reg) { | |
| char padded_string[16] __attribute__((aligned(16))) = {0}; | |
| size_t size_c = str->length; | |
| size_t s_c = size_c > 16 ? 16 : size_c; | |
| memcpy(padded_string, str->data, s_c); | |
| *reg = _mm_loadu_si128((__m128i*) padded_string); | |
| } | |
| STATIC_INLINE void load_str_m128i_str_offset(const string *str, size_t offset, __m128i *reg) { | |
| char padded_string[16] __attribute__((aligned(16))) = {0}; | |
| size_t size_c = str->length - offset; | |
| size_t s_c = size_c > 16 ? 16 : size_c; | |
| memcpy(padded_string, str->data + offset, s_c); | |
| *reg = _mm_loadu_si128((__m128i*) padded_string); | |
| } | |
| int contains_str(string *a, string *b) { | |
| size_t size_a = a->length; | |
| size_t size_b = b->length; | |
| uint8_t *a_array = a->data; | |
| __m128i reg_b; | |
| load_str_m128i_str(b, ®_b); | |
| for (int i = 0; i <= size_a - size_b; i++) { | |
| __m128i *data = (__m128i*) (a_array + i); | |
| __m128i slice = _mm_loadu_si128(data); | |
| int flags = _mm_cmpestrc(slice, size_b, reg_b, 16, _SIDD_CMP_EQUAL_ORDERED); | |
| if(flags & 1) return 1; | |
| } | |
| return 0; | |
| } | |
| STATIC_INLINE void lowercase(string *a) { | |
| int a_len = a->length; | |
| uint8_t* a_array = a->data; | |
| __m128i A_mask = _mm_set1_epi8('A'); | |
| __m128i thirty_two = _mm_set1_epi8(32); | |
| __m128i twenty_six_mask = _mm_set1_epi8(26); | |
| __m128i neg_1 = _mm_set1_epi8(-1); | |
| size_t blocks = a->length / 16; | |
| size_t remainder = a->length % 16; | |
| size_t total_blocks = blocks * 16; | |
| for (int i = 0; i < total_blocks; i += 0x10) { | |
| __m128i *data = (__m128i*) (a_array + i); | |
| // | 65 | 78 | 97 | (BNa) | |
| __m128i slice = _mm_loadu_si128(data); | |
| // | 65 - 64 = 1 | 78 - 64 = 14 | 97 - 64 = 33 | (BNa) | |
| __m128i mask_sub = _mm_sub_epi8(slice,A_mask); | |
| // | 1 < 26 = 1 | 14 < 26 = 1 | 33 < 26 = 0 | | |
| __m128i mask2 = _mm_cmplt_epi8(mask_sub, twenty_six_mask); | |
| // | 1 > -1 = 1 | 14 > -1 = 1 | 33 > -1 = 1 | | |
| __m128i mask3 = _mm_cmpgt_epi8(mask_sub, neg_1); | |
| // | 1 & 1 = 1 | 1 & 1 = 1 | 1 & 0 = 0 | | |
| __m128i mask4 = _mm_and_si128(mask2, mask3); | |
| // | 11111111 & 00010000 = 32 | 11111111 & 00010000 = 32 | 11111111 & 00000000 = 0 | ; 11111111(1 in m128 registers) | |
| __m128i mask_final = _mm_and_si128(mask4, thirty_two); | |
| // | 65 | 32 = 97 | 78 | 32 = 110 | 97 | 0 = 97 | 'A'(65) | 32 (00100001) = 97 (00110001) | |
| __m128i result = _mm_or_si128(slice, mask_final); | |
| _mm_storeu_si128(data, result); | |
| // uint8_t cur_char = a_array[i]; | |
| // a_array[i] = cur_char - 'A' < 26 ? cur_char | 0x20 : cur_char; | |
| } | |
| if(remainder > 0) { | |
| __m128i reg_remnant; | |
| load_str_m128i_str_offset(a, total_blocks, ®_remenant); | |
| // | 65 - 64 = 1 | 78 - 64 = 14 | 97 - 64 = 33 | (BNa) | |
| __m128i mask_sub = _mm_sub_epi8(reg_remenant,A_mask); | |
| // | 1 < 26 = 1 | 14 < 26 = 1 | 33 < 26 = 0 | | |
| __m128i mask2 = _mm_cmplt_epi8(mask_sub, twenty_six_mask); | |
| // | 1 > -1 = 1 | 14 > -1 = 1 | 33 > -1 = 1 | | |
| __m128i mask3 = _mm_cmpgt_epi8(mask_sub, neg_1); | |
| // | 1 & 1 = 1 | 1 & 1 = 1 | 1 & 0 = 0 | | |
| __m128i mask4 = _mm_and_si128(mask2, mask3); | |
| // | 11111111 & 00010000 = 32 | 11111111 & 00100000 = 32 | 11111111 & 00000000 = 0 | ; 11111111(1 in m128 registers) | |
| __m128i mask_final = _mm_and_si128(mask4, thirty_two); | |
| // | 65 | 32 = 97 | 78 | 32 = 110 | 97 | 0 = 97 | 'A'(65) | 32 (00100001) = 97 (00110001) | |
| __m128i result = _mm_or_si128(reg_remenant, mask_final); | |
| union { | |
| __m128i vec; | |
| uint8_t bytes[16]; | |
| } u; | |
| u.vec = result; | |
| memcpy(a_array + total_blocks, u.bytes, remainder); | |
| } | |
| } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment