Last active
November 28, 2021 17:00
-
-
Save umezawatakeshi/407b1607b880c5c6a6257d80e61d8353 to your computer and use it in GitHub Desktop.
count UTF-8 codepoint
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <functional> | |
#include <utility> | |
#include <algorithm> | |
#define NOMINMAX | |
#include <Windows.h> | |
#include <intrin.h> | |
#include <cstdlib> | |
const double clock_cycle = 3.4 * 1000 * 1000 * 1000; // Core i7-4770 @3.4GHz | |
class timecounter | |
{ | |
private: | |
LARGE_INTEGER liStart, liStop; | |
static double freq; | |
public: | |
void start() { QueryPerformanceCounter(&liStart); } | |
void stop() { QueryPerformanceCounter(&liStop); } | |
double get() { return (double)(liStop.QuadPart - liStart.QuadPart) / freq; } | |
}; | |
double timecounter::freq = []() { | |
LARGE_INTEGER li; | |
QueryPerformanceFrequency(&li); | |
return (double)li.QuadPart; | |
}(); | |
#define FUNCNAME(fn) { #fn, fn } | |
size_t reference_implementation(const char* mem, size_t sz) | |
{ | |
size_t ret = 0; | |
for (size_t i = 0; i < sz; ++i) | |
if ((mem[i] & 0x80) == 0x00 || (mem[i] & 0xc0) == 0xC0) | |
++ret; | |
return ret; | |
} | |
inline int32_t avx2_horizontal_sum_epi8(__m256i x) | |
{ | |
__m256i sumhi = _mm256_unpackhi_epi8(x, _mm256_setzero_si256()); | |
__m256i sumlo = _mm256_unpacklo_epi8(x, _mm256_setzero_si256()); | |
__m256i sum16x16 = _mm256_add_epi16(sumhi, sumlo); | |
__m256i sum16x8 = _mm256_add_epi16(sum16x16, _mm256_permute2x128_si256(sum16x16, sum16x16, 1)); | |
__m256i sum16x4 = _mm256_add_epi16(sum16x8, _mm256_shuffle_epi32(sum16x8, _MM_SHUFFLE(0, 0, 2, 3))); | |
uint64_t tmp = _mm256_extract_epi64(sum16x4, 0); | |
int32_t result = 0; | |
result += (tmp >> 0) & 0xffff; | |
result += (tmp >> 16) & 0xffff; | |
result += (tmp >> 32) & 0xffff; | |
result += (tmp >> 48) & 0xffff; | |
return result; | |
} | |
#ifdef __AVX512F__ | |
inline int32_t avx512_horizontal_sum_epi8(__m512i x) | |
{ | |
__m512i sumhi = _mm512_unpackhi_epi8(x, _mm512_setzero_si512()); | |
__m512i sumlo = _mm512_unpacklo_epi8(x, _mm512_setzero_si512()); | |
__m512i sum16x32 = _mm512_add_epi16(sumhi, sumlo); | |
__m256i sum16x16 = _mm256_add_epi16(_mm512_castsi512_si256(sum16x32), _mm512_extracti64x4_epi64(sum16x32, 1)); | |
__m128i sum16x8 = _mm_add_epi16(_mm256_castsi256_si128(sum16x16), _mm256_extracti128_si256(sum16x16, 1)); | |
__m128i sum16x4 = _mm_add_epi16(sum16x8, _mm_srli_si128(sum16x8, 8)); | |
uint64_t tmp = _mm_cvtsi128_si64(sum16x4); | |
tmp += (tmp >> 32); | |
tmp += (tmp >> 16); | |
return tmp & 0xffff; | |
} | |
#endif | |
size_t avx_count_utf8_codepoint(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i + 31 < sz;) { | |
__m256i sum = _mm256_setzero_si256(); | |
size_t j = 0; | |
for (; j < 255 * 32 && (i + 31) + j < sz; j += 32) { | |
const __m256i table = _mm256_setr_epi8( | |
1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7 | |
0, 0, 0, 0, // 0x8 .. 0xB | |
1, 1, 1, 1, // 0xC .. 0xF | |
1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7 | |
0, 0, 0, 0, // 0x8 .. 0xB | |
1, 1, 1, 1 // 0xC .. 0xF | |
); | |
__m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j)); | |
s = _mm256_and_si256(_mm256_srli_epi16(s, 4), _mm256_set1_epi8(0x0F)); | |
s = _mm256_shuffle_epi8(table, s); | |
sum = _mm256_add_epi8(sum, s); | |
} | |
i += j; | |
result += avx2_horizontal_sum_epi8(sum); | |
} | |
return result; | |
} | |
size_t avx_count_utf8_codepoint_loopend(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i < sz;) { | |
__m256i sum = _mm256_setzero_si256(); | |
size_t j = 0; | |
size_t limit = std::min<size_t>(255 * 32, sz - i); | |
for (; j < limit; j += 32) { | |
const __m256i table = _mm256_setr_epi8( | |
1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7 | |
0, 0, 0, 0, // 0x8 .. 0xB | |
1, 1, 1, 1, // 0xC .. 0xF | |
1, 1, 1, 1, 1, 1, 1, 1, // .. 0x7 | |
0, 0, 0, 0, // 0x8 .. 0xB | |
1, 1, 1, 1 // 0xC .. 0xF | |
); | |
__m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j)); | |
s = _mm256_and_si256(_mm256_srli_epi16(s, 4), _mm256_set1_epi8(0x0F)); | |
s = _mm256_shuffle_epi8(table, s); | |
sum = _mm256_add_epi8(sum, s); | |
} | |
i += j; | |
result += avx2_horizontal_sum_epi8(sum); | |
} | |
return result; | |
} | |
size_t opt_innermost_content(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i + 31 < sz;) { | |
__m256i sum = _mm256_setzero_si256(); | |
size_t j = 0; | |
for (; j < 255 * 32 && (i + 31) + j < sz; j += 32) { | |
__m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j)); | |
sum = _mm256_sub_epi8(sum, _mm256_cmpgt_epi8(s, _mm256_set1_epi8(-0x41))); | |
} | |
i += j; | |
result += avx2_horizontal_sum_epi8(sum); | |
} | |
return result; | |
} | |
size_t opt_innermost_content_loopend(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i < sz;) { | |
__m256i sum = _mm256_setzero_si256(); | |
size_t j = 0; | |
size_t limit = std::min<size_t>(255 * 32, sz - i); | |
for (; j < limit; j += 32) { | |
__m256i s = _mm256_load_si256(reinterpret_cast<const __m256i *>(p + i + j)); | |
sum = _mm256_sub_epi8(sum, _mm256_cmpgt_epi8(s, _mm256_set1_epi8(-0x41))); | |
} | |
i += j; | |
result += avx2_horizontal_sum_epi8(sum); | |
} | |
return result; | |
} | |
#ifdef __AVX512F__ | |
size_t avx512_cmpgt_popcnt(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i < sz; i += 64) { | |
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i)); | |
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41)); | |
result += _mm_popcnt_u64(m); | |
} | |
return result; | |
} | |
size_t avx512_cmpgt_popcnt_nestedloop(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i < sz; ) { | |
size_t sum = 0; | |
size_t j = 0; | |
size_t limit = std::min<size_t>(256 * 64, sz - i); | |
for (; j < limit; j += 64) { | |
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j)); | |
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41)); | |
sum += _mm_popcnt_u64(m); | |
} | |
i += j; | |
result += sum; | |
} | |
return result; | |
} | |
size_t avx512_cmpgt_movm2b_sub(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i < sz;) { | |
__m512i sum = _mm512_setzero_si512(); | |
size_t j = 0; | |
size_t limit = std::min<size_t>(255 * 64, sz - i); | |
for (; j < limit; j += 64) { | |
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j)); | |
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41)); | |
sum = _mm512_sub_epi8(sum, _mm512_movm_epi8(m)); | |
} | |
i += j; | |
result += avx512_horizontal_sum_epi8(sum); | |
} | |
return result; | |
} | |
size_t avx512_cmpgt_movdqu8_add(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i < sz;) { | |
__m512i sum = _mm512_setzero_si512(); | |
size_t j = 0; | |
size_t limit = std::min<size_t>(255 * 64, sz - i); | |
for (; j < limit; j += 64) { | |
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j)); | |
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41)); | |
sum = _mm512_add_epi8(sum, _mm512_maskz_mov_epi8(m, _mm512_set1_epi8(1))); | |
} | |
i += j; | |
result += avx512_horizontal_sum_epi8(sum); | |
} | |
return result; | |
} | |
size_t avx512_cmpgt_maskadd(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i < sz;) { | |
__m512i sum = _mm512_setzero_si512(); | |
size_t j = 0; | |
size_t limit = std::min<size_t>(255 * 64, sz - i); | |
for (; j < limit; j += 64) { | |
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j)); | |
__mmask64 m = _mm512_cmpgt_epi8_mask(s, _mm512_set1_epi8(-0x41)); | |
sum = _mm512_mask_add_epi8(sum, m, sum, _mm512_set1_epi8(1)); | |
} | |
i += j; | |
result += avx512_horizontal_sum_epi8(sum); | |
} | |
return result; | |
} | |
size_t avx512_cmpgt_maskadd128(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i < sz;) { | |
__m512i sum0 = _mm512_setzero_si512(); | |
__m512i sum1 = _mm512_setzero_si512(); | |
size_t j = 0; | |
size_t limit = std::min<size_t>(255 * 128, sz - i); | |
for (; j < limit; j += 128) { | |
__m512i s0 = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j)); | |
__m512i s1 = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j + 64)); | |
__mmask64 m0 = _mm512_cmpgt_epi8_mask(s0, _mm512_set1_epi8(-0x41)); | |
__mmask64 m1 = _mm512_cmpgt_epi8_mask(s1, _mm512_set1_epi8(-0x41)); | |
sum0 = _mm512_mask_add_epi8(sum0, m0, sum0, _mm512_set1_epi8(1)); | |
sum1 = _mm512_mask_add_epi8(sum1, m1, sum1, _mm512_set1_epi8(1)); | |
} | |
i += j; | |
result += avx512_horizontal_sum_epi8(sum0); | |
result += avx512_horizontal_sum_epi8(sum1); | |
} | |
return result; | |
} | |
size_t avx512_pshufb_add(const char *p, size_t sz) | |
{ | |
size_t result = 0; | |
for (size_t i = 0; i < sz;) { | |
__m512i sum = _mm512_setzero_si512(); | |
size_t j = 0; | |
size_t limit = std::min<size_t>(255 * 64, sz - i); | |
for (; j < limit; j += 64) { | |
const __m512i table = _mm512_set_epi8( | |
1, 1, 1, 1, // 0xF .. 0xC | |
0, 0, 0, 0, // 0xB .. 0x8 | |
1, 1, 1, 1, 1, 1, 1, 1, // 0x7 .. | |
1, 1, 1, 1, // 0xF .. 0xC | |
0, 0, 0, 0, // 0xB .. 0x8 | |
1, 1, 1, 1, 1, 1, 1, 1, // 0x7 .. | |
1, 1, 1, 1, // 0xF .. 0xC | |
0, 0, 0, 0, // 0xB .. 0x8 | |
1, 1, 1, 1, 1, 1, 1, 1, // 0x7 .. | |
1, 1, 1, 1, // 0xF .. 0xC | |
0, 0, 0, 0, // 0xB .. 0x8 | |
1, 1, 1, 1, 1, 1, 1, 1 // 0x7 .. | |
); | |
__m512i s = _mm512_load_si512(reinterpret_cast<const __m512i *>(p + i + j)); | |
s = _mm512_and_si512(_mm512_srli_epi16(s, 4), _mm512_set1_epi8(0x0F)); | |
s = _mm512_shuffle_epi8(table, s); | |
sum = _mm512_add_epi8(sum, s); | |
} | |
i += j; | |
result += avx512_horizontal_sum_epi8(sum); | |
} | |
return result; | |
} | |
#endif | |
const std::pair<size_t, int> sizereps[] = { | |
{ 32 * 256 * 2, 1 }, | |
#ifndef _DEBUG | |
// { 32 * 255 * 2, 10000000 }, | |
{ 32 * 256 * 2, 10000000 }, | |
// { 32 * 255 * 28, 1000000 }, | |
{ 32 * 256 * 28, 1000000 }, | |
// { 32 * 255 * 768, 40000 }, | |
{ 32 * 256 * 768, 40000 }, | |
// { 32 * 255 * 16384, 1000 }, | |
{ 32 * 256 * 16384, 1000 }, | |
#endif | |
}; | |
/* const をつけると clang の最適化がすごい ※1 */ std::pair<const char*, size_t(*)(const char*, size_t)> funcs[] = { | |
// FUNCNAME(reference_implementation), | |
FUNCNAME(avx_count_utf8_codepoint), | |
FUNCNAME(avx_count_utf8_codepoint_loopend), | |
FUNCNAME(opt_innermost_content), | |
FUNCNAME(opt_innermost_content_loopend), | |
#ifdef __AVX512F__ | |
FUNCNAME(avx512_cmpgt_popcnt), | |
FUNCNAME(avx512_cmpgt_popcnt_nestedloop), | |
FUNCNAME(avx512_cmpgt_movm2b_sub), | |
FUNCNAME(avx512_cmpgt_movdqu8_add), | |
FUNCNAME(avx512_cmpgt_maskadd), | |
FUNCNAME(avx512_cmpgt_maskadd128), | |
FUNCNAME(avx512_pshufb_add), | |
#endif | |
}; | |
int main() | |
{ | |
for (const auto [sz, rep] : sizereps) | |
{ | |
printf("size %zu loop %d\n", sz, rep); | |
char* mem = (char*)_aligned_malloc(sz, 64); | |
for (size_t i = 0; i < sz; ++i) | |
mem[i] = rand(); | |
const size_t ncodepoint = reference_implementation(mem, sz); | |
for (const auto[name, fn] : funcs) | |
{ | |
timecounter tc; | |
size_t r; | |
printf(" %-32s: ", name); | |
tc.start(); | |
for (int i = 0; i < rep; ++i) | |
r = fn(mem, sz); // ※1 のところで const を付けると clang がここの呼び出しが純粋関数になることを利用してループせずに1回だけの実行になってしまう | |
tc.stop(); | |
if (r != ncodepoint) | |
{ | |
printf("wrong answer!\n"); | |
return 1; | |
} | |
printf("%10.3fus %5.1fGB/s %5.2fclk/32B\n", | |
tc.get() / rep * (1000 * 1000), | |
((double)sz) * rep / tc.get() / (1000 * 1000 * 1000), | |
tc.get() * clock_cycle / sz / rep * 32); | |
} | |
_aligned_free(mem); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
looks like shit