Last active
September 28, 2024 23:18
-
-
Save powturbo/2b06a84b6008dfffef11e53edba297d3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Count the bytes equal to c in the first n bytes of s, using AVX2.
 *
 * The needle is compared as an unsigned char (the same convention as memchr),
 * so memcount_avx2(buf, 0xFF, n) and memcount_avx2(buf, -1, n) both count
 * 0xFF bytes, no matter whether a byte is handled by the vector code or by
 * the scalar tail.
 *
 * @param s  buffer to scan; no alignment requirement (unaligned loads are used)
 * @param c  byte value to look for; only the low 8 bits are significant
 * @param n  number of bytes to scan
 * @return   number of matching bytes
 *
 * The target attribute lets this function compile even when the translation
 * unit is not built with -mavx2; the caller is still responsible for only
 * calling it on a CPU that supports AVX2.
 */
__attribute__((target("avx2")))
size_t memcount_avx2(const void *s, int c, size_t n) {
    enum {
        VEC_BYTES   = 32,                    /* bytes per __m256i */
        UNROLL      = 4,                     /* independent accumulators */
        BLOCK_VECS  = 252,                   /* multiple of UNROLL, <= 255 */
        BLOCK_BYTES = BLOCK_VECS * VEC_BYTES
    };

    const unsigned char *p = s;
    const unsigned char *const end = p + n;
    const unsigned char *const bulk_end = p + (n - n % BLOCK_BYTES);
    const unsigned char needle = (unsigned char)c;

    const __m256i cv = _mm256_set1_epi8((char)needle);
    const __m256i zv = _mm256_setzero_si256();
    __m256i sum = zv;                        /* four 64-bit partial totals */

    /*
     * Bulk: whole blocks of BLOCK_VECS vectors. cmpeq yields 0xFF (-1) in each
     * matching lane, so subtracting the mask adds 1 to that lane's counter.
     * Each of the four accumulators sees BLOCK_VECS / UNROLL = 63 vectors per
     * block, so the 8-bit lane counters cannot wrap before they are flushed.
     */
    while (p != bulk_end) {
        __m256i acr0 = zv, acr1 = zv, acr2 = zv, acr3 = zv;
        const unsigned char *const block_end = p + BLOCK_BYTES;

        for (; p != block_end; p += UNROLL * VEC_BYTES) {
            acr0 = _mm256_sub_epi8(acr0, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)p)));
            acr1 = _mm256_sub_epi8(acr1, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)(p + 32))));
            acr2 = _mm256_sub_epi8(acr2, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)(p + 64))));
            acr3 = _mm256_sub_epi8(acr3, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)(p + 96))));
            __builtin_prefetch(p + 1024);
        }

        /* sad_epu8 against zero sums each group of 8 byte counters into a 64-bit lane. */
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr0, zv));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr1, zv));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr2, zv));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr3, zv));
    }

    /*
     * Remainder: fewer than BLOCK_BYTES bytes are left, i.e. at most 251 whole
     * vectors, so a single 8-bit accumulator is still safe. The bound is
     * expressed as a remaining-length check so that no pointer beyond the end
     * of the buffer is formed and the last full vector is not left to the
     * scalar loop.
     */
    __m256i acr = zv;
    for (; (size_t)(end - p) >= VEC_BYTES; p += VEC_BYTES) {
        acr = _mm256_sub_epi8(acr, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)p)));
    }
    sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr, zv));

    size_t count = (size_t)_mm256_extract_epi64(sum, 0)
                 + (size_t)_mm256_extract_epi64(sum, 1)
                 + (size_t)_mm256_extract_epi64(sum, 2)
                 + (size_t)_mm256_extract_epi64(sum, 3);

    /* Scalar tail (< 32 bytes): compare as unsigned char so needles >= 0x80 match here too. */
    for (; p != end; p++) {
        count += (*p == needle);
    }
    return count;
}
Okay, after some further investigation: unrolling the loop by 3 or 5 is better than by 4, because 255 is divisible by 3 and 5, so you can set the limit to 255 * 32 and still use a single acr
v32_i8: ff ff ff ff | ff ff ff ff | ff ff ff ff | ff ff ff ff | ff ff ff ff | ff ff ff ff | ff ff ff ff | ff ff ff ff <- acr0
v4_i64: 14280 14280 14280 14280 | 37c8 37c8 37c8 37c8 <- sum
v32_i8: 1 1 1 1 | 1 1 1 1 | 1 1 1 1 | 1 1 1 1 | 1 1 1 1 | 1 1 1 1 | 1 1 1 1 | 1 1 1 1 <- back to the outer loop
Actually, I see it's probably better not to depend on a single acr, and to let the work execute out of order instead
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hey @powturbo, have you checked whether the acr accumulators actually fill up (reach 127) in this scenario? In all my testing they only got up to 0x3f (63). Could you use only acr0 and acr1, like
and then only do 2 sad_epu8 calls.
I'm unable to verify whether it's faster: even when the data is in cache I'm not hitting 100% core utilization, so it is waiting on memory/cache.
I'm using your variant from SO https://stackoverflow.com/a/57929966/23017301
There's a possibility I'm missing something, which is why I'm asking you :P