Created
August 8, 2012 22:40
-
-
Save apage43/3299487 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdint.h> | |
#include <stddef.h> | |
#ifndef __cplusplus | |
#define true 1 | |
#define false 0 | |
#endif | |
/*
 * Check whether a byte buffer holds well-formed UTF-8.
 *
 * data:   pointer to the first byte to examine (need not be NUL-terminated).
 * length: number of bytes to examine.
 *
 * Returns true (non-zero) when every byte belongs to a well-formed sequence,
 * false (0) otherwise. An empty buffer is considered valid.
 *
 * Rejected input:
 *   - continuation bytes (0x80..0xBF) that do not follow a lead byte
 *   - sequences cut short by the end of the buffer or by a non-continuation
 *   - overlong encodings (lead 0xC0/0xC1, 0xE0 0x80..0x9F, 0xF0 0x80..0x8F)
 *   - UTF-16 surrogates U+D800..U+DFFF (0xED 0xA0..0xBF)
 *   - code points above U+10FFFF (0xF4 0x90.., lead 0xF5..0xFF)
 *
 * Bytes are inspected one at a time, so the result does not depend on host
 * endianness or on the alignment of `data`.
 */
int check_utf8(unsigned const char* data, size_t length) {
    if (length == 0) {
        return true;
    }

    unsigned const char* end = data + length;

    while (data < end) {
        unsigned char lead = *data++;

        /* Plain ASCII needs no further checks. */
        if (lead < 0x80) {
            continue;
        }

        size_t trailing;           /* continuation bytes required after lead */
        unsigned char first_lo = 0x80; /* allowed range of the FIRST         */
        unsigned char first_hi = 0xBF; /* continuation byte                  */

        if (lead >= 0xC2 && lead <= 0xDF) {
            /* 0xC0 and 0xC1 could only encode overlong ASCII. */
            trailing = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trailing = 2;
            if (lead == 0xE0) {
                first_lo = 0xA0;   /* below this is an overlong 3-byte form */
            } else if (lead == 0xED) {
                first_hi = 0x9F;   /* above this is a UTF-16 surrogate */
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;
            if (lead == 0xF0) {
                first_lo = 0x90;   /* below this is an overlong 4-byte form */
            } else if (lead == 0xF4) {
                first_hi = 0x8F;   /* above this exceeds U+10FFFF */
            }
        } else {
            /* Stray continuation byte (0x80..0xBF) or invalid lead byte. */
            return false;
        }

        /* The whole sequence must fit inside the buffer. */
        if ((size_t)(end - data) < trailing) {
            return false;
        }

        if (*data < first_lo || *data > first_hi) {
            return false;
        }
        data++;

        /* Any remaining continuation bytes must look like 10xxxxxx. */
        for (size_t i = 1; i < trailing; i++, data++) {
            if ((*data & 0xC0) != 0x80) {
                return false;
            }
        }
    }

    return true;
}
/*
 * Validate X with check_utf8() and report the verdict on stdout:
 * "<X> - yes" when it is accepted, "no" otherwise.
 *
 * X must be a char array or string literal (not a pointer): the length is
 * taken as sizeof(X) - 1, i.e. the array size minus its trailing NUL.
 * No semicolon follows while (0), so `CHECKS(x);` is a single statement
 * and is safe inside an unbraced if/else.
 */
#define CHECKS(X) do { \
    const unsigned char *checks_input_ = (const unsigned char *)(X); \
    if (check_utf8(checks_input_, sizeof(X) - 1)) { \
        printf("%s - yes\n", (const char *)checks_input_); \
    } else { \
        printf("no\n"); \
    } \
} while (0)
/*
 * Demo driver: runs CHECKS over a handful of sample inputs. For each one it
 * prints "<text> - yes" when check_utf8() accepts it and "no" otherwise.
 * Command-line arguments are ignored. Always returns 0.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    /* Samples covering 1-, 2-, 3- and 4-byte encoded characters. */
    CHECKS("normal");
    CHECKS("breve'd a: ă");  /* two-byte character */
    CHECKS("airplane: ✈");   /* three-byte character */
    CHECKS("parenb: 🄑");     /* four-byte character */

    /*
     * Malformed samples. Each array ends with a 0 byte, which CHECKS
     * excludes from the checked length via sizeof - 1.
     */

    /* continuation byte outside of any sequence */
    unsigned char stray_continuation[] = {128, 0};
    CHECKS(stray_continuation);

    /* ASCII byte where a continuation byte is required */
    unsigned char ascii_in_sequence[] = {226, 65, 0};
    CHECKS(ascii_in_sequence);

    /* lead byte at the very end, sequence cut short */
    unsigned char truncated_tail[] = {65, 226, 0};
    CHECKS(truncated_tail);

    /* a new lead byte inside an unfinished sequence */
    unsigned char nested_lead[] = {226, 226, 65, 67, 0};
    CHECKS(nested_lead);

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment