Created
February 1, 2019 08:15
-
-
Save kccqzy/0f0aedc3f6789229170601f804ae13a0 to your computer and use it in GitHub Desktop.
Test UTF-8 decoder using PEXT instruction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
// Packs the bits of `value` selected by `mask` into the low-order bits of the
// result using the x86 PEXT instruction.
static inline unsigned pext_bits(unsigned value, unsigned mask) {
  unsigned packed;
  __asm__("pext %1, %2, %0" : "=r"(packed) : "r"(mask), "r"(value));
  return packed;
}

// Decodes one UTF-8 encoded code point starting at `buf`, advances `buf` past
// the bytes that were consumed, and returns the code point.
//
// Four bytes are always loaded from `buf`, whatever the length of the
// sequence, so the caller must make at least four bytes readable there.
// A lead/continuation pattern that matches none of the 1- to 4-byte forms
// terminates the process via abort(). Only the marker bits are checked; the
// decoded value itself is not range-validated.
unsigned pext_utf8_decode(unsigned char*& buf) {
  // Byte-swap after loading so the first byte of the sequence ends up in the
  // most significant byte of `word`.
  unsigned word;
  __builtin_memcpy(&word, buf, 4);
  word = __builtin_bswap32(word);

  // Expected common case: top bit clear, i.e. a single byte below 0x80.
  if (__builtin_expect((word >> 31) == 0, 1)) {
    buf += 1;
    return word >> 24;
  }

  // Two bytes: 110xxxxx 10xxxxxx
  if (((word >> 16) & 0xE0C0u) == 0xC080u) {
    buf += 2;
    return pext_bits(word, 0x1F3F0000u);
  }

  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
  if (((word >> 8) & 0xF0C0C0u) == 0xE08080u) {
    buf += 3;
    return pext_bits(word, 0x0F3F3F00u);
  }

  // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  if ((word & 0xF8C0C0C0u) == 0xF0808080u) {
    buf += 4;
    return pext_bits(word, 0x073F3F3Fu);
  }

  // Not a recognised sequence.
  abort();
}
#include <stdio.h> | |
#include <assert.h> | |
int main() { | |
{ | |
unsigned char b1[] = {0xe2, 0x82, 0xac, 0}; | |
unsigned char* buf = b1; | |
assert(pext_utf8_decode(buf) == 0x20ac); | |
assert(buf == b1 + 3); | |
} | |
{ | |
unsigned char b2[] = {'g', 0, 0, 0}; | |
unsigned char* buf = b2; | |
assert(pext_utf8_decode(buf) == (unsigned) 'g'); | |
assert(buf == b2 + 1); | |
} | |
{ | |
unsigned char b3[] = {0xc2, 0xa2, 0, 0}; | |
unsigned char* buf = b3; | |
assert(pext_utf8_decode(buf) == 0xa2); | |
assert(buf == b3 + 2); | |
} | |
{ | |
unsigned char b4[] = {0xe0, 0xa4, 0xb9, 0}; | |
unsigned char* buf = b4; | |
assert(pext_utf8_decode(buf) == 0x939); | |
assert(buf == b4 + 3); | |
} | |
{ | |
unsigned char b5[] = {0xe2, 0x82, 0xac, 0}; | |
unsigned char* buf = b5; | |
assert(pext_utf8_decode(buf) == 0x20ac); | |
assert(buf == b5 + 3); | |
} | |
{ | |
unsigned char b6[] = {0xf0, 0x90, 0x8d, 0x88, 0}; | |
unsigned char* buf = b6; | |
assert(pext_utf8_decode(buf) == 0x10348); | |
assert(buf == b6 + 4); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment