Skip to content

Instantly share code, notes, and snippets.

@apage43
Created August 8, 2012 21:42
Show Gist options
  • Save apage43/3299039 to your computer and use it in GitHub Desktop.
Save apage43/3299039 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#ifndef __cplusplus
#define true 1
#define false 0
#endif
// Validates that a byte buffer is well-formed UTF-8 as defined by RFC 3629.
//
// data:   bytes to check; may be NULL only when length is 0.
// length: number of bytes in data. A 0 byte inside the buffer is treated as
//         the ordinary ASCII character NUL, not as a terminator.
//
// Returns true (non-zero) when every byte is part of a complete, shortest-form
// sequence that encodes a value in U+0000..U+10FFFF outside the surrogate
// range U+D800..U+DFFF. Returns false (0) otherwise. An empty buffer is valid.
int check_utf8(unsigned const char* data, size_t length) {
    if (data == NULL) {
        // Avoid pointer arithmetic on NULL; only the empty buffer is valid.
        return length == 0;
    }

    unsigned const char* end = data + length;

    while (data < end) {
        unsigned char lead = *data++;
        // Allowed range for the FIRST continuation byte. It is narrowed for
        // a few lead bytes to exclude overlong forms, surrogates and values
        // above U+10FFFF.
        unsigned char first_min = 0x80;
        unsigned char first_max = 0xBF;
        int trailing;

        if (lead <= 0x7F) {
            // Single-byte (ASCII) character.
            continue;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            // Two-byte sequence. 0xC0 and 0xC1 could only encode values
            // below U+0080 (overlong), so they are not valid lead bytes.
            trailing = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trailing = 2;
            if (lead == 0xE0) {
                first_min = 0xA0;   // below this would be overlong (< U+0800)
            } else if (lead == 0xED) {
                first_max = 0x9F;   // above this would be U+D800..U+DFFF
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;
            if (lead == 0xF0) {
                first_min = 0x90;   // below this would be overlong (< U+10000)
            } else if (lead == 0xF4) {
                first_max = 0x8F;   // above this would exceed U+10FFFF
            }
        } else {
            // Stray continuation byte (0x80..0xBF), overlong lead (0xC0,
            // 0xC1) or a lead byte that cannot occur in UTF-8 (0xF5..0xFF).
            return false;
        }

        // The whole sequence must fit in the remaining input.
        if ((size_t)(end - data) < (size_t)trailing) {
            return false;
        }

        if (*data < first_min || *data > first_max) {
            return false;
        }
        data++;
        trailing--;

        // Remaining continuation bytes must all look like 10xxxxxx.
        for (; trailing > 0; trailing--, data++) {
            if ((*data & 0xC0) != 0x80) {
                return false;
            }
        }
    }

    return true;
}
// Runs check_utf8 over X and prints "<X> - yes" when it validates, "no"
// otherwise. X must be an array or string literal: its length is taken as
// sizeof(X) - 1, i.e. everything except the final terminating byte.
// X is expanded more than once, so it must not have side effects.
// The definition deliberately has no trailing semicolon; the caller supplies
// it, which lets CHECKS(x); act as a single statement inside if/else.
#define CHECKS(X) \
    do { \
        if(check_utf8((const unsigned char*) (X), sizeof(X) - 1)) { \
            printf("%s - yes\n", (const char*) (X)); \
        } else { \
            printf("no\n"); \
        } \
    } while(0)
// Demo driver: pushes four string literals and four hand-built byte arrays
// through CHECKS, which prints one result line per input. Always returns 0.
int main(int argc, char** argv) {
    // Malformed inputs. Each ends in a 0 byte that CHECKS excludes from the
    // length it passes on.
    unsigned char stray_continuation[] = {0x80, 0x00};              // continuation byte outside any sequence
    unsigned char ascii_inside_seq[]   = {0xE2, 0x41, 0x00};        // ASCII byte where a continuation is due
    unsigned char truncated_tail[]     = {0x41, 0xE2, 0x00};        // input ends in the middle of a sequence
    unsigned char lead_inside_seq[]    = {0xE2, 0xE2, 0x41, 0x43, 0x00}; // lead byte inside another sequence

    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    // Plain ASCII.
    CHECKS("normal");
    // Contains a two-byte character.
    CHECKS("breve'd a: ă");
    // Contains a three-byte character.
    CHECKS("airplane: ✈");
    // Contains a four-byte character.
    CHECKS("parenb: 🄑");

    CHECKS(stray_continuation);
    CHECKS(ascii_inside_seq);
    CHECKS(truncated_tail);
    CHECKS(lead_inside_seq);

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment