Created
March 23, 2017 07:42
-
-
Save sunfishcode/c050d4f60633c49ae6e54a3d45385031 to your computer and use it in GitHub Desktop.
"UTF-8 decode without BOM or fail" implementation in C
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
/*
 * Report whether a byte buffer is well-formed UTF-8.
 *
 * str: the bytes to check. It is only read when len > 0, so a null
 *      pointer is accepted together with len == 0.
 * len: number of bytes at str. The buffer need not be NUL-terminated
 *      and may contain embedded zero bytes.
 *
 * Returns true when every byte belongs to a complete, well-formed
 * sequence; an empty buffer is valid. Returns false for a stray
 * continuation byte, an overlong encoding, an encoded surrogate
 * (U+D800..U+DFFF), a code point above U+10FFFF, a lead byte that can
 * never occur (0xc0, 0xc1, 0xf5..0xff), or a sequence cut short by the
 * end of the buffer. A leading byte order mark gets no special
 * treatment; it is validated like any other three-byte sequence.
 */
bool is_valid_utf8(const uint8_t *str, size_t len)
{
    /* Continuation bytes still owed by the sequence in progress. */
    size_t remaining = 0;
    /*
     * Inclusive range allowed for the next continuation byte. Only the
     * first continuation after certain lead bytes is narrowed; the range
     * goes back to 0x80..0xbf as soon as that byte has been accepted.
     */
    uint8_t lower_boundary = 0x80;
    uint8_t upper_boundary = 0xbf;

    /* Indexing (rather than computing str + len) keeps an empty input
     * from doing any pointer arithmetic on str. */
    for (size_t i = 0; i < len; i++) {
        uint8_t byte = str[i];

        if (remaining == 0) {
            if (byte < 0x80) {
                /* ASCII: a complete sequence on its own. */
                continue;
            }
            if (byte < 0xc2) {
                /* 0x80..0xbf is a continuation byte with no lead;
                 * 0xc0 and 0xc1 could only start overlong forms. */
                return false;
            }
            if (byte < 0xe0) {
                remaining = 1;
                continue;
            }
            if (byte < 0xf0) {
                if (byte == 0xe0) {
                    /* Below 0xa0 would re-encode a code point < U+0800. */
                    lower_boundary = 0xa0;
                } else if (byte == 0xed) {
                    /* Above 0x9f would encode a surrogate. */
                    upper_boundary = 0x9f;
                }
                remaining = 2;
                continue;
            }
            if (byte < 0xf5) {
                if (byte == 0xf0) {
                    /* Below 0x90 would re-encode a code point < U+10000. */
                    lower_boundary = 0x90;
                } else if (byte == 0xf4) {
                    /* Above 0x8f would exceed U+10FFFF. */
                    upper_boundary = 0x8f;
                }
                remaining = 3;
                continue;
            }
            /* 0xf5..0xff never appear in UTF-8. */
            return false;
        }

        if (byte < lower_boundary || byte > upper_boundary) {
            return false;
        }
        lower_boundary = 0x80;
        upper_boundary = 0xbf;
        --remaining;
    }

    /* A non-zero count here means the buffer ended mid-sequence. */
    return remaining == 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment