/*
 * GitHub Gist apage43/3299487 by @apage43.
 * Created August 8, 2012 22:40.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
// Provide true/false as plain integer constants when compiling as C.
// The definitions are skipped when __cplusplus is defined.
#ifndef __cplusplus
#define true 1
#define false 0
#endif
/*
 * Check whether a byte buffer is well-formed UTF-8 (RFC 3629).
 *
 * data:   pointer to the first byte of the buffer; may be NULL only when
 *         length is 0.
 * length: number of bytes to examine (no terminator is required or assumed).
 *
 * Returns non-zero (true) when every byte belongs to a well-formed sequence,
 * and 0 (false) otherwise. An empty buffer is considered valid.
 *
 * Rejected input includes: stray continuation bytes, truncated sequences,
 * overlong encodings (C0/C1 leads, E0 80..9F, F0 80..8F), UTF-16 surrogates
 * (ED A0..BF) and code points above U+10FFFF (F4 90.., F5..FF leads).
 *
 * All bytes are inspected one at a time, so the result does not depend on
 * host endianness or on the alignment of the buffer.
 */
int check_utf8(unsigned const char* data, size_t length) {
    unsigned const char* end;

    // Avoid pointer arithmetic on a possibly-NULL pointer for empty input.
    if (length == 0) {
        return true;
    }
    end = data + length;

    while (data < end) {
        unsigned char lead = *data;
        size_t trail;              // continuation bytes required after lead
        unsigned char lo = 0x80;   // allowed range of the FIRST continuation
        unsigned char hi = 0xBF;   // byte; narrowed for special lead bytes
        size_t i;

        if (lead < 0x80) {
            // Plain ASCII.
            data++;
            continue;
        } else if (lead < 0xC2) {
            // 0x80..0xBF: continuation byte outside a sequence.
            // 0xC0/0xC1: would only encode overlong forms of ASCII.
            return false;
        } else if (lead < 0xE0) {
            trail = 1;
        } else if (lead < 0xF0) {
            trail = 2;
            if (lead == 0xE0) {
                lo = 0xA0;         // below this is an overlong 2-byte value
            } else if (lead == 0xED) {
                hi = 0x9F;         // above this is a surrogate (U+D800..DFFF)
            }
        } else if (lead < 0xF5) {
            trail = 3;
            if (lead == 0xF0) {
                lo = 0x90;         // below this is an overlong 3-byte value
            } else if (lead == 0xF4) {
                hi = 0x8F;         // above this exceeds U+10FFFF
            }
        } else {
            // 0xF5..0xFF never appear in well-formed UTF-8.
            return false;
        }

        // The whole sequence (lead + trail bytes) must fit in the buffer.
        if ((size_t)(end - data) <= trail) {
            return false;
        }

        if (data[1] < lo || data[1] > hi) {
            return false;
        }
        for (i = 2; i <= trail; i++) {
            if ((data[i] & 0xC0) != 0x80) {
                return false;
            }
        }

        data += trail + 1;
    }

    return true;
}
/*
 * Run check_utf8 on X and report the result on stdout.
 *
 * X must be a string literal or a NUL-terminated array (not a pointer):
 * the length is taken as sizeof(X) - 1, i.e. everything but the final byte.
 * Prints "<X> - yes" when the check passes and "no" otherwise.
 *
 * The expansion deliberately has no trailing semicolon so that the call
 * site's own ';' completes the statement and CHECKS(x); is safe inside an
 * unbraced if/else.
 */
#define CHECKS(X) do { \
    if (check_utf8((const unsigned char *)(X), sizeof(X) - 1)) { \
        printf("%s - yes\n", (const char *)(X)); \
    } else { \
        printf("no\n"); \
    } \
} while (0)
/*
 * Demo driver: feeds a handful of sample byte strings through CHECKS and
 * lets it print the verdict for each one. Always returns 0.
 */
int main(int argc, char** argv) {
    /* Text samples: plain ASCII, then one sample each containing a
     * two-byte, three-byte and four-byte character. */
    CHECKS("normal");
    CHECKS("breve'd a: ă");
    CHECKS("airplane: ✈");
    CHECKS("parenb: 🄑");

    /* Hand-built malformed samples. Each array ends with a 0 byte, which
     * CHECKS excludes from the checked length via sizeof - 1. */

    /* A continuation byte that is not part of any sequence. */
    unsigned char stray_continuation[] = {0x80, 0x00};
    /* An ASCII byte where a continuation byte is required. */
    unsigned char ascii_in_sequence[] = {0xE2, 0x41, 0x00};
    /* A lead byte with the input ending before its sequence completes. */
    unsigned char truncated_tail[] = {0x41, 0xE2, 0x00};
    /* A second lead byte appearing inside an unfinished sequence. */
    unsigned char nested_lead[] = {0xE2, 0xE2, 0x41, 0x43, 0x00};

    CHECKS(stray_continuation);
    CHECKS(ascii_in_sequence);
    CHECKS(truncated_tail);
    CHECKS(nested_lead);

    return 0;
}
/* End of gist. (GitHub page footer: sign in to comment on this gist.) */