Skip to content

Instantly share code, notes, and snippets.

@CAFxX
Last active October 24, 2024 08:25
Show Gist options
  • Save CAFxX/0a5cad9df679b1d8b4b8c3357fc6d559 to your computer and use it in GitHub Desktop.
Save CAFxX/0a5cad9df679b1d8b4b8c3357fc6d559 to your computer and use it in GitHub Desktop.
memchrs
// https://godbolt.org/z/63Ebd37vz
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
void* memchrs(const void* haystack, int len, const char* needles, int n) {
if (len <= 0 || n <= 0) {
return NULL;
}
__m512i n30, n29, n28, n27, n26, n25, n24, n23, n22, n21, n20, n19, n18, n17, n16, n15, n14, n13, n12, n11, n10, n9, n8, n7, n6, n5, n4, n3, n2, n1, n0;
switch (n-1<=30?n-1:29) {
case 30: n30 = _mm512_set1_epi8(needles[30]);
case 29: n29 = _mm512_set1_epi8(needles[29]);
case 28: n28 = _mm512_set1_epi8(needles[28]);
case 27: n27 = _mm512_set1_epi8(needles[27]);
case 26: n26 = _mm512_set1_epi8(needles[26]);
case 25: n25 = _mm512_set1_epi8(needles[25]);
case 24: n24 = _mm512_set1_epi8(needles[24]);
case 23: n23 = _mm512_set1_epi8(needles[23]);
case 22: n22 = _mm512_set1_epi8(needles[22]);
case 21: n21 = _mm512_set1_epi8(needles[21]);
case 20: n20 = _mm512_set1_epi8(needles[20]);
case 19: n19 = _mm512_set1_epi8(needles[19]);
case 18: n18 = _mm512_set1_epi8(needles[18]);
case 17: n17 = _mm512_set1_epi8(needles[17]);
case 16: n16 = _mm512_set1_epi8(needles[16]);
case 15: n15 = _mm512_set1_epi8(needles[15]);
case 14: n14 = _mm512_set1_epi8(needles[14]);
case 13: n13 = _mm512_set1_epi8(needles[13]);
case 12: n12 = _mm512_set1_epi8(needles[12]);
case 11: n11 = _mm512_set1_epi8(needles[11]);
case 10: n10 = _mm512_set1_epi8(needles[10]);
case 9: n9 = _mm512_set1_epi8(needles[9]);
case 8: n8 = _mm512_set1_epi8(needles[8]);
case 7: n7 = _mm512_set1_epi8(needles[7]);
case 6: n6 = _mm512_set1_epi8(needles[6]);
case 5: n5 = _mm512_set1_epi8(needles[5]);
case 4: n4 = _mm512_set1_epi8(needles[4]);
case 3: n3 = _mm512_set1_epi8(needles[3]);
case 2: n2 = _mm512_set1_epi8(needles[2]);
case 1: n1 = _mm512_set1_epi8(needles[1]);
n0 = _mm512_set1_epi8(needles[0]);
break;
case 0: return memchr((char*)haystack, needles[0], len);
}
static const void* dispatch_table[] = { &&case_0, &&case_1, &&case_2, &&case_3, &&case_4, &&case_5, &&case_6, &&case_7, &&case_8, &&case_9, &&case_10, &&case_11, &&case_12, &&case_13, &&case_14, &&case_15, &&case_16, &&case_17, &&case_18, &&case_19, &&case_20, &&case_21, &&case_22, &&case_23, &&case_24, &&case_25, &&case_26, &&case_27, &&case_28, &&case_29, &&case_30, &&case_30_or_more };
const void* target = dispatch_table[n-1<31?n-1:31];
__m512i h;
__mmask64 m;
__mmask64 tm;
int i = 0;
loop_start:
if (i<len) {
tm = _cvtu64_mask64((((uint64_t)1)<<(len-i))-1);
h = _mm512_maskz_loadu_epi8(tm, (char*)haystack+i);
m = 0;
goto *target;
}
return NULL;
case_30_or_more:
#pragma unroll(4)
for (int j=30; j<n; j++) {
n30 = _mm512_set1_epi8(needles[j]);
m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n30));
}
goto case_29;
case_30: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n30));
case_29: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n29));
case_28: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n28));
case_27: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n27));
case_26: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n26));
case_25: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n25));
case_24: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n24));
case_23: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n23));
case_22: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n22));
case_21: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n21));
case_20: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n20));
case_19: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n19));
case_18: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n18));
case_17: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n17));
case_16: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n16));
case_15: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n15));
case_14: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n14));
case_13: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n13));
case_12: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n12));
case_11: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n11));
case_10: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n10));
case_9: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n9));
case_8: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n8));
case_7: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n7));
case_6: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n6));
case_5: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n5));
case_4: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n4));
case_3: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n3));
case_2: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n2));
case_1: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n1));
case_0: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n0));
if (m != 0) {
return (char*)haystack + i + __builtin_ctz(m);
}
i += 64;
goto loop_start;
return NULL;
}
/**
 * Invoke a callback for every byte of a buffer that equals any needle.
 *
 * Repeatedly calls memchrs() and resumes the search one byte past each
 * match, so every matching byte is reported once, in ascending address
 * order, and the function returns when no further match exists.
 *
 * @param haystack  buffer to scan
 * @param len       number of bytes in haystack
 * @param needles   bytes to look for (n entries)
 * @param n         number of entries in needles
 * @param fn        called as fn(match, remaining) for each match, where
 *                  remaining is the length of the region that was searched
 *                  to find this match (equal to len for the first match).
 *                  NOTE(review): the intended meaning of the second
 *                  argument is not stated anywhere in this file; confirm
 *                  with callers.
 */
void memchrs_cb(const void* haystack, int len, const char* needles, int n, void(*fn)(const void*, int len)) {
    const char* cursor = (const char*)haystack;
    int remaining = len;

    for (;;) {
        const char* hit = memchrs(cursor, remaining, needles, n);
        if (!hit) {
            return;
        }
        fn(hit, remaining);

        /* Step past the reported byte; without this the same match repeats forever. */
        const int consumed = (int)(hit - cursor) + 1;
        cursor += consumed;
        remaining -= consumed;
    }
}
<%# Template fragment for the body of memchrs: declares one broadcast variable per needle slot (n30..n0), fills them through a fallthrough switch, then scans the haystack 64 bytes at a time, entering the unrolled compare chain through a computed-goto table. %>
__m512i <%= 30.downto(0).map {|j| "n#{j}" }.join(", ") %>;
switch (n-1<=30?n-1:29) {
<% 30.downto(0) {|j| %>
case <%= j %>: n<%= j %> = _mm512_set1_epi8(needles[<%= j %>]);
<% } %>
}
static const void* dispatch_table[] = { <%= (0..30).map {|j| "&&case_#{j}" }.join(", ") %>, &&case_30_or_more };
const void* target = dispatch_table[n-1<31?n-1:31];
__m512i h;
__mmask64 m;
__mmask64 tm;
int i = 0;
loop_start:
if (i<len) {
// A shift by 64 or more is undefined, so a full chunk gets the all-ones mask explicitly.
tm = _cvtu64_mask64(len-i >= 64 ? ~(uint64_t)0 : (((uint64_t)1)<<(len-i))-1);
h = _mm512_maskz_loadu_epi8(tm, (char*)haystack+i);
m = 0;
goto *target;
}
return NULL;
case_30_or_more:
// Needles beyond the preloaded ones are broadcast per chunk, reusing n30 as scratch.
for (int j=30; j<n; j++) {
n30 = _mm512_set1_epi8(needles[j]);
m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n30));
}
goto case_29;
<% 30.downto(0) {|j| %>
case_<%= j %>: m = _kor_mask64(m, _mm512_mask_cmpeq_epi8_mask(tm, h, n<%= j %>));
<% } %>
if (m != 0) {
// m is 64 bits wide: the 64-bit ctz is needed to see lanes 32..63.
return (char*)haystack + i + __builtin_ctzll(m);
}
i += 64;
goto loop_start;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment