Skip to content

Instantly share code, notes, and snippets.

@oliora
Last active August 3, 2023 10:43
Show Gist options
  • Save oliora/0e76c0185679c494be21d4ee7fa6a58b to your computer and use it in GitHub Desktop.
Save oliora/0e76c0185679c494be21d4ee7fa6a58b to your computer and use it in GitHub Desktop.
Primitive AVX2 strlen implementation in C++
#pragma once
#include "common.h" // https://github.com/oliora/habr-switches-perf-test/blob/main/common.h
#include <cstdint>
#include <string_view>
#include <utility>
#include <immintrin.h>
/// Computes the length of a null-terminated string, scanning StepSize bytes
/// per main-loop iteration with AVX2 byte comparisons.
///
/// The string head is walked byte by byte up to the pointer returned by
/// alignedAfter<StepSize>(); from there on, whole steps are loaded with
/// aligned 32-byte loads and tested for a zero byte. Once a step containing
/// a zero byte is found, that step is rescanned byte by byte to locate it.
///
/// NOTE(review): whole steps are loaded, so bytes after the terminator are
/// read. The constraints (StepSize <= PageSize, PageSize % StepSize == 0)
/// look intended to keep each step inside one memory page - confirm against
/// PageSize / alignedAfter in common.h.
///
/// @tparam StepSize Bytes examined per iteration; must be a non-zero
///                  multiple of sizeof(__m256i) (32) that divides PageSize.
/// @param input     Pointer to a null-terminated character sequence.
/// @return          Number of characters preceding the first '\0'.
template <size_t StepSize>
requires((StepSize <= PageSize)
    && (PageSize % StepSize == 0)
    && (StepSize % sizeof(long long) == 0))
inline size_t strlenVectorized(const char *input) noexcept {
    // The requires-clause alone admits StepSize == 8 or 16. With those values
    // WordsPerStep would be 0 (no vector is ever loaded, so the "found" mask
    // would never be computed) and the preamble would not be asked to align
    // to the 32 bytes _mm256_load_si256 needs. Reject them at compile time.
    static_assert(StepSize >= sizeof(__m256i) && StepSize % sizeof(__m256i) == 0,
        "StepSize must be a non-zero multiple of the 32-byte AVX2 vector size");

    constexpr size_t WordsPerStep = StepSize / sizeof(__m256i);
    const auto start = input;

    // Process unaligned preamble in naive way.
    // (alignedAfter is presumably "first StepSize-aligned address at or after
    // input" - defined in common.h, verify there.)
    {
        const auto headEnd = alignedAfter<StepSize>(input);
        while (input != headEnd && *input) {
            ++input;
        }
        if (input != headEnd) [[unlikely]] {
            // Stopped on a '\0' before reaching the aligned boundary.
            return input - start;
        }
    }

    const __m256i zero = _mm256_setzero_si256(); // All 32 bytes are 0

    while (true) {
        // Accumulates the per-byte "equals zero" masks of every 32-byte word
        // of the current step; starts empty so each word can simply be OR-ed in.
        __m256i eq_null = zero;
        forEach<WordsPerStep>([&]<size_t Idx>(IndexConstant<Idx>) {
            const __m256i block = _mm256_load_si256(reinterpret_cast<const __m256i*>(input) + Idx);
            eq_null = _mm256_or_si256(eq_null, _mm256_cmpeq_epi8(zero, block));
        });

        // _mm256_testz_si256(a, a) returns 1 only when every bit of `a` is 0,
        // so a 0 result means at least one byte of the step matched '\0'.
        if (!_mm256_testz_si256(eq_null, eq_null)) [[unlikely]] {
            // Found null character, calculate the remaining values char by
            // char until we hit the null character
            while (*input) {
                ++input;
            }
            return input - start;
        }

        input += StepSize;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment