Created
August 8, 2012 21:42
-
-
Save apage43/3299039 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
// C has no built-in true/false before C23; take them from the standard
// header instead of hand-rolling macros that could clash with it.
// C++ already has them as keywords, so nothing is needed there.
#ifndef __cplusplus
#include <stdbool.h>
#endif
// Check whether a byte buffer is well-formed UTF-8.
//
// data   - pointer to the bytes to validate (need not be NUL-terminated;
//          may be NULL only when length is 0)
// length - number of bytes to examine
//
// Returns 1 if every byte in [data, data + length) belongs to a well-formed
// UTF-8 sequence, 0 otherwise. An empty buffer is considered valid.
//
// Besides the structural checks (stray continuation bytes, truncated
// sequences, a lead byte or ASCII byte where a continuation byte is
// expected), this also rejects:
//   - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by < 0xA0,
//     0xF0 followed by < 0x90),
//   - UTF-16 surrogates U+D800..U+DFFF (0xED followed by > 0x9F),
//   - code points above U+10FFFF (0xF4 followed by > 0x8F, lead >= 0xF5).
int check_utf8(unsigned const char* data, size_t length) {
    const unsigned char *end;

    // Nothing to check; also avoids pointer arithmetic on a NULL pointer.
    if (length == 0) {
        return 0 == 0;
    }
    end = data + length;

    while (data < end) {
        unsigned char lead = *data++;
        // Allowed range for the FIRST continuation byte. It is narrower than
        // 0x80..0xBF for a few lead bytes, which is how overlong forms,
        // surrogates and out-of-range code points are excluded.
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;
        int trailing;
        int i;

        if (lead <= 0x7F) {
            // Plain ASCII, a complete sequence on its own.
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF) {
            // Two-byte sequence. 0xC0 and 0xC1 could only encode values
            // below 0x80 (overlong), so they fall through to the reject.
            trailing = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trailing = 2;
            if (lead == 0xE0) {
                lo = 0xA0;      // below this would be an overlong form
            } else if (lead == 0xED) {
                hi = 0x9F;      // above this would be a surrogate
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;
            if (lead == 0xF0) {
                lo = 0x90;      // below this would be an overlong form
            } else if (lead == 0xF4) {
                hi = 0x8F;      // above this would exceed U+10FFFF
            }
        } else {
            // 0x80..0xBF: continuation byte outside of a sequence.
            // 0xC0, 0xC1, 0xF5..0xFF: never valid in UTF-8.
            return 0;
        }

        // The sequence must not run past the end of the buffer.
        if ((size_t)(end - data) < (size_t)trailing) {
            return 0;
        }

        if (*data < lo || *data > hi) {
            return 0;
        }
        data++;

        // Remaining continuation bytes must look like 10xxxxxx.
        for (i = 1; i < trailing; i++) {
            if ((*data & 0xC0) != 0x80) {
                return 0;
            }
            data++;
        }
    }

    return 1;
}
// CHECKS(X): run check_utf8() over X and print the verdict to stdout.
// X must be a string literal or a char array, not a pointer: the length
// passed is sizeof(X) - 1, i.e. the array size minus its final byte.
// Prints "<X> - yes" when check_utf8() accepts the bytes, "no" otherwise.
// There is deliberately no semicolon after while (0), so that "CHECKS(x);"
// is exactly one statement and is safe inside an unbraced if/else.
#define CHECKS(X) \
    do { \
        if (check_utf8((const unsigned char *)(X), sizeof(X) - 1)) { \
            printf("%s - yes\n", (const char *)(X)); \
        } else { \
            printf("no\n"); \
        } \
    } while (0)
// Demo driver: feeds a handful of sample inputs through CHECKS, which
// prints one line per input. Always returns 0.
int main(int argc, char** argv) {
    // Byte sequences meant to be rejected. Each carries an explicit
    // trailing 0 so that sizeof - 1 in CHECKS yields the payload length.
    unsigned char stray_continuation[] = {0x80, 0x00};          // char outside of seq
    unsigned char ascii_inside_seq[] = {0xE2, 0x41, 0x00};      // unexpected ascii char in mb-seq
    unsigned char truncated_seq[] = {0x41, 0xE2, 0x00};         // too short at end
    unsigned char nested_lead[] = {0xE2, 0xE2, 0x41, 0x43, 0x00}; // mb-seq inside mb-seq

    // Plain ASCII, then one sample each of a two-, three- and four-byte
    // character.
    CHECKS("normal");
    CHECKS("breve'd a: ă");
    CHECKS("airplane: ✈");
    CHECKS("parenb: 🄑");

    CHECKS(stray_continuation);
    CHECKS(ascii_inside_seq);
    CHECKS(truncated_seq);
    CHECKS(nested_lead);

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment