Skip to content

Instantly share code, notes, and snippets.

@sacko87
Last active August 29, 2015 13:57
Show Gist options
  • Save sacko87/9455014 to your computer and use it in GitHub Desktop.
Save sacko87/9455014 to your computer and use it in GitHub Desktop.
#include "utf8.h"
#include <errno.h>
/*
 * Return the byte length (1-4) of the well-formed UTF-8 sequence starting
 * at s, or 0 when the bytes at s do not form one.
 *
 * Follows the UTF-8 grammar of RFC 3629: the first continuation byte has a
 * narrowed range after certain lead bytes so that overlong encodings
 * (E0 80..9F, F0 80..8F), UTF-16 surrogates (ED A0..BF) and code points
 * above U+10FFFF (F4 90..BF) are rejected.  A NUL byte never satisfies a
 * continuation check, so the scan stops at the terminator and never reads
 * beyond it.
 */
static int
isutf8_sequence(const unsigned char *s)
{
  unsigned char lo = 0x80; /* lowest value allowed for the second byte  */
  unsigned char hi = 0xbf; /* highest value allowed for the second byte */
  int need;
  int i;

  if (s[0] < 0x80) {
    return 1; /* U+0000 .. U+007F */
  }

  if (s[0] >= 0xc2 && s[0] <= 0xdf) {
    need = 2; /* U+0080 .. U+07FF; C0 and C1 would be overlong */
  } else if (s[0] >= 0xe0 && s[0] <= 0xef) {
    need = 3; /* U+0800 .. U+FFFF */
    if (s[0] == 0xe0) {
      lo = 0xa0; /* below that the encoding is overlong */
    } else if (s[0] == 0xed) {
      hi = 0x9f; /* above that lie the surrogates U+D800 .. U+DFFF */
    }
  } else if (s[0] >= 0xf0 && s[0] <= 0xf4) {
    need = 4; /* U+10000 .. U+10FFFF */
    if (s[0] == 0xf0) {
      lo = 0x90; /* below that the encoding is overlong */
    } else if (s[0] == 0xf4) {
      hi = 0x8f; /* above that the code point exceeds U+10FFFF */
    }
  } else {
    return 0; /* stray continuation byte or a lead byte UTF-8 never uses */
  }

  if (s[1] < lo || s[1] > hi) {
    return 0;
  }
  for (i = 2; i < need; i++) {
    if ((s[i] >> 6) != 2) { /* continuation bytes look like 10xxxxxx */
      return 0;
    }
  }
  return need;
}

/*
 * Check whether a NUL-terminated byte string is well-formed UTF-8.
 *
 * string: the bytes to validate, terminated by a NUL byte.
 *
 * Returns 1 when every sequence up to the terminator is well-formed (the
 * empty string is valid) and 0 otherwise.  Whenever 0 is returned, errno is
 * set to EINVAL; this includes a null string pointer.
 */
uint8_t
isutf8(const unsigned char *string)
{
  const unsigned char *ptr = string;
  int nbytes;

  if (!string) {
    errno = EINVAL;
    return 0;
  }

  while (*ptr) {
    nbytes = isutf8_sequence(ptr);
    if (nbytes == 0) {
      errno = EINVAL;
      return 0;
    }
    ptr += nbytes; /* next character */
  }
  return 1;
}
/*
 * Return the byte length (1-4) of the well-formed UTF-8 sequence starting
 * at s, or 0 when the bytes at s do not form one.
 *
 * Follows the UTF-8 grammar of RFC 3629: the first continuation byte has a
 * narrowed range after certain lead bytes so that overlong encodings
 * (E0 80..9F, F0 80..8F), UTF-16 surrogates (ED A0..BF) and code points
 * above U+10FFFF (F4 90..BF) are rejected.  A NUL byte never satisfies a
 * continuation check, so the scan stops at the terminator and never reads
 * beyond it.
 */
static int
utf8len_sequence(const unsigned char *s)
{
  unsigned char lo = 0x80; /* lowest value allowed for the second byte  */
  unsigned char hi = 0xbf; /* highest value allowed for the second byte */
  int need;
  int i;

  if (s[0] < 0x80) {
    return 1; /* U+0000 .. U+007F */
  }

  if (s[0] >= 0xc2 && s[0] <= 0xdf) {
    need = 2; /* U+0080 .. U+07FF; C0 and C1 would be overlong */
  } else if (s[0] >= 0xe0 && s[0] <= 0xef) {
    need = 3; /* U+0800 .. U+FFFF */
    if (s[0] == 0xe0) {
      lo = 0xa0; /* below that the encoding is overlong */
    } else if (s[0] == 0xed) {
      hi = 0x9f; /* above that lie the surrogates U+D800 .. U+DFFF */
    }
  } else if (s[0] >= 0xf0 && s[0] <= 0xf4) {
    need = 4; /* U+10000 .. U+10FFFF */
    if (s[0] == 0xf0) {
      lo = 0x90; /* below that the encoding is overlong */
    } else if (s[0] == 0xf4) {
      hi = 0x8f; /* above that the code point exceeds U+10FFFF */
    }
  } else {
    return 0; /* stray continuation byte or a lead byte UTF-8 never uses */
  }

  if (s[1] < lo || s[1] > hi) {
    return 0;
  }
  for (i = 2; i < need; i++) {
    if ((s[i] >> 6) != 2) { /* continuation bytes look like 10xxxxxx */
      return 0;
    }
  }
  return need;
}

/*
 * Count the characters (code points) in a NUL-terminated UTF-8 string.
 *
 * string: the UTF-8 bytes to measure, terminated by a NUL byte.
 *
 * Returns the number of encoded characters, which is 0 for the empty
 * string.  On failure returns -1 and sets errno:
 *   EINVAL - string is a null pointer or contains an ill-formed sequence;
 *   ERANGE - the count does not fit in an int32_t.
 */
int32_t
utf8len(const unsigned char *string)
{
  const unsigned char *ptr = string;
  int32_t len = 0;
  int nbytes;

  if (!string) {
    errno = EINVAL;
    return -1;
  }

  while (*ptr) {
    nbytes = utf8len_sequence(ptr);
    if (nbytes == 0) {
      errno = EINVAL;
      return -1;
    }
    if (len == INT32_MAX) { /* incrementing would overflow the result */
      errno = ERANGE;
      return -1;
    }
    len++;
    ptr += nbytes; /* next character */
  }
  return len;
}
#ifndef UTF8_H
#define UTF8_H
#include <stdint.h>
/**
 * Check whether a NUL-terminated byte string is accepted as UTF-8.
 *
 * @param string  bytes to validate; scanning stops at the first NUL byte.
 * @return 1 when every byte sequence in the string is accepted, 0 when a
 *         sequence is rejected. errno may be set to EINVAL on rejection.
 */
uint8_t isutf8(const unsigned char *string);
/**
 * Count the characters in a NUL-terminated UTF-8 string. Each encoded
 * sequence of one to four bytes counts as one character.
 *
 * @param string  UTF-8 bytes to measure; scanning stops at the first NUL
 *                byte.
 * @return the number of characters on success; -1 with errno set to EINVAL
 *         when a malformed sequence is detected.
 *         NOTE(review): confirm the implementation reports every rejected
 *         lead byte this way before relying on the -1 convention alone.
 */
int32_t utf8len(const unsigned char *string);
#endif /* UTF8_H */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment