Last active
July 25, 2022 16:47
-
-
Save ske2004/495464204f3db8f34d01e6b2ee92a2bb to your computer and use it in GitHub Desktop.
My first attempt at SIMD :P
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <emmintrin.h>
#include <immintrin.h>
// Benchmark(name, times) { body }
//
// Statement prefix: runs the block that follows it `times` times, then prints
// the total and the per-iteration time measured with clock().
//
// How the nested loops cooperate:
//   - `t` is a run-once flag; it is cleared in the timing loop's step
//     expression, which ends all three outer loops.
//   - `final` lives in its own loop so it is in scope for that step expression.
//   - `start` is sampled once, immediately before the iteration loop begins.
//   - the innermost loop repeats the user's block.
//
// `name` is printed with %s and `times` with %d, so pass a string and an int.
// `times` is expanded more than once: keep it free of side effects.
// The macro declares `t`, `final`, `start` and `i`; they are visible inside
// the block and shadow outer variables with the same names.
#define Benchmark(name, times)                                          \
    for (int t = 1; t;)                                                 \
        for (double final = 0.0; t;)                                    \
            for (clock_t start = clock(); t;                            \
                 final = (double)(clock() - start) / CLOCKS_PER_SEC,    \
                 printf(" <%s> %gs / %dtimes = %gs\n", (name), final,   \
                        (times), final / (times)),                      \
                 t = 0)                                                 \
                for (int i = 0; i < (times); ++i)
// Number of input bytes the AVX2 decoder consumes per call, and therefore the
// number of Rune slots in its destination buffer.
#define Utf8_BYTES_PER_ASCII 32

// One decoded character; every input byte is widened into one Rune.
typedef int Rune;

// Destination buffer for Utf8_decode_ascii and Utf8_decode_ascii_naive.
typedef Rune Utf8_RuneDestination[Utf8_BYTES_PER_ASCII];

// Destination buffer for Utf8_decode_ascii_sse2, which handles 16 bytes
// (one 128-bit vector) per call.
typedef Rune Utf8_RuneDestinationSSE2[16];
// Cursor over the part of a byte string that has not been decoded yet.
// The decoders advance `string` and decrease `string_len` as they consume
// input; the stream is finished when `string_len` reaches zero.
typedef struct Utf8_Streaming {
    const char *string;  // next byte to decode
    size_t string_len;   // number of bytes left starting at `string`
} Utf8_Streaming;
Utf8_Streaming Utf8_begin_stream(const char *string) { | |
return (Utf8_Streaming) { | |
.string = string, | |
.string_len = strlen(string) | |
}; | |
} | |
// Prints the first `max` values of `pack` to stdout as zero-padded 8-digit
// hexadecimal words separated by spaces, followed by a newline.
void Utf8_printpack_base(const int *pack, int max) {
    for (int i = 0; i < max; ++i) {
        // %x is specified for unsigned int, so convert explicitly.
        printf("%08x ", (unsigned)pack[i]);
    }
    printf("\n");
}
// Debug helper: prints the eight 32-bit lanes of `p` in memory order.
void Utf8_printpack_32x8(__m256i p) {
    int lanes[8];
    // Unaligned store: `lanes` is only int-aligned.
    _mm256_storeu_si256((__m256i *)lanes, p);
    Utf8_printpack_base(lanes, 8);
}
// Debug helper: prints the four 32-bit lanes of `p` in memory order.
void Utf8_printpack_32x4(__m128i p) {
    int lanes[4];
    // Unaligned store: `lanes` is only int-aligned.
    _mm_storeu_si128((__m128i *)lanes, p);
    Utf8_printpack_base(lanes, 4);
}
// Debug helper: prints the sixteen 8-bit lanes of `p` in memory order, each
// widened to an int so it can go through Utf8_printpack_base.
void Utf8_printpack_8x16(__m128i p) {
    // unsigned char keeps bytes >= 0x80 from being sign-extended, so a lane
    // holding 0x80 prints as 00000080 regardless of plain-char signedness.
    unsigned char bytes[16];
    int widened[16];
    _mm_storeu_si128((__m128i *)bytes, p);
    for (int i = 0; i < 16; ++i) {
        widened[i] = bytes[i];
    }
    Utf8_printpack_base(widened, 16);
}
static inline | |
void Utf8_decode_ascii_sse2(Utf8_Streaming *stream, Utf8_RuneDestinationSSE2 dest) { | |
if (stream->string_len >= 16) { | |
__m128i data = _mm_loadu_si128((const __m128i_u*)stream->string); | |
__m128i lo = _mm_unpacklo_epi8(data, _mm_set1_epi32(0)); | |
__m128i hi = _mm_unpackhi_epi8(data, _mm_set1_epi32(0)); | |
// unpack 8x16 acii chars into 32x16 destination | |
_mm_storeu_si128((__m128i_u*)dest+0, _mm_unpacklo_epi16(lo, _mm_set1_epi32(0))); | |
_mm_storeu_si128((__m128i_u*)dest+1, _mm_unpackhi_epi16(lo, _mm_set1_epi32(0))); | |
_mm_storeu_si128((__m128i_u*)dest+2, _mm_unpacklo_epi16(hi, _mm_set1_epi32(0))); | |
_mm_storeu_si128((__m128i_u*)dest+3, _mm_unpackhi_epi16(hi, _mm_set1_epi32(0))); | |
stream->string_len -= 16; | |
stream->string += 16; | |
} else { | |
// eh | |
for (size_t i = 0; i < stream->string_len; ++i) { | |
dest[i] = stream->string[i]; | |
} | |
stream->string += stream->string_len; | |
dest[stream->string_len] = 0; | |
stream->string_len = 0; | |
} | |
} | |
static inline | |
void Utf8_decode_ascii_naive(Utf8_Streaming *stream, Utf8_RuneDestination dest) { | |
// eh | |
size_t i; | |
for (i = 0; i < stream->string_len && i < 32; ++i) { | |
dest[i] = stream->string[i]; | |
} | |
if (i < 32) { | |
dest[i] = 0; | |
} | |
stream->string += i; | |
stream->string_len -= i; | |
} | |
static inline | |
void Utf8_decode_ascii(Utf8_Streaming *stream, Utf8_RuneDestination dest) { | |
if (stream->string_len >= Utf8_BYTES_PER_ASCII) { | |
__m256i data = _mm256_loadu_si256((const __m256i_u*)stream->string); | |
__m256i lo = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(data, 0)); | |
__m256i hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(data, 1)); | |
__m256i e1 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(lo, 0)); | |
__m256i e2 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(lo, 1)); | |
__m256i e3 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(hi, 0)); | |
__m256i e4 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(hi, 1)); | |
_mm256_storeu_si256((__m256i_u*)dest+0, e1); | |
_mm256_storeu_si256((__m256i_u*)dest+1, e2); | |
_mm256_storeu_si256((__m256i_u*)dest+2, e3); | |
_mm256_storeu_si256((__m256i_u*)dest+3, e4); | |
stream->string_len -= Utf8_BYTES_PER_ASCII; | |
stream->string += Utf8_BYTES_PER_ASCII; | |
} else { | |
// eh | |
for (size_t i = 0; i < stream->string_len; ++i) { | |
dest[i] = stream->string[i]; | |
} | |
stream->string += stream->string_len; | |
dest[stream->string_len] = 0; | |
stream->string_len = 0; | |
} | |
} | |
bool Utf8_stream_running(Utf8_Streaming *stream) { | |
return stream->string_len; | |
} | |
// Reads the whole file at `path` into a freshly allocated, NUL-terminated
// buffer.
//
// Returns the buffer on success; the caller owns it and must free() it.
// Returns NULL if `path` is NULL, the file cannot be opened, its size cannot
// be determined, memory cannot be allocated, or reading fails.
//
// The file is opened in binary mode so that the size reported by ftell()
// is a byte count that matches what fread() delivers.
char *read_file(const char *path) {
    char *contents = NULL;
    long size = -1;

    if (path == NULL) {
        return NULL;
    }
    FILE *f = fopen(path, "rb");
    if (f == NULL) {
        return NULL;
    }

    if (fseek(f, 0, SEEK_END) != 0) {
        goto out;
    }
    size = ftell(f);
    if (size < 0 || fseek(f, 0, SEEK_SET) != 0) {
        goto out;
    }

    contents = malloc((size_t)size + 1);
    if (contents == NULL) {
        goto out;
    }

    // Terminate after the bytes actually read, which may be fewer than `size`.
    size_t got = fread(contents, 1, (size_t)size, f);
    if (ferror(f)) {
        free(contents);
        contents = NULL;
        goto out;
    }
    contents[got] = '\0';

out:
    fclose(f);
    return contents;
}
int main() { | |
char *file = read_file("TestFile.txt"); | |
Utf8_Streaming streamA = Utf8_begin_stream(file); | |
Utf8_Streaming streamB = Utf8_begin_stream(file); | |
printf("String length: %zu\n", streamA.string_len); | |
Benchmark("BLANK", 10000) {} | |
Benchmark("SSE2", 10000) { | |
Utf8_Streaming stream = streamA; | |
while (Utf8_stream_running(&stream)) { | |
Utf8_RuneDestinationSSE2 dest; | |
Utf8_decode_ascii_sse2(&stream, dest); | |
} | |
} | |
Benchmark("NAIVE", 10000) { | |
Utf8_Streaming stream = streamA; | |
while (Utf8_stream_running(&stream)) { | |
Utf8_RuneDestination dest; | |
Utf8_decode_ascii_naive(&stream, dest); | |
} | |
} | |
Benchmark("AVX2", 10000) { | |
Utf8_Streaming stream = streamA; | |
while (Utf8_stream_running(&stream)) { | |
Utf8_RuneDestination dest; | |
Utf8_decode_ascii(&stream, dest); | |
} | |
} | |
free(file); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment