Skip to content

Instantly share code, notes, and snippets.

@gocha
Last active July 6, 2016 01:16
Show Gist options
  • Save gocha/2e5018a5c4d67284c9ccfd5a9cc99120 to your computer and use it in GitHub Desktop.
Save gocha/2e5018a5c4d67284c9ccfd5a9cc99120 to your computer and use it in GitHub Desktop.
strnlen for UTF-8. Considers the character boundary and an invalid sequence.
/**
* @file
* strnlen for UTF-8: never splits a multi-byte character at the length limit, and rejects invalid sequences.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/**
* Returns the byte length of a null-terminated UTF-8 string, examining at most
* maxlen bytes and never splitting a multi-byte character.
*
* Only the structure of the encoding is validated (lead byte pattern followed
* by the right number of 10xxxxxx trailer bytes). The legacy 5- and 6-byte
* forms are accepted; overlong encodings and surrogates are not rejected.
*
* @param str pointer to the null-terminated byte string to be examined.
* @param maxlen maximum number of bytes to examine; no byte at index maxlen
*        or beyond is ever read.
* @return The number of bytes before the null terminator if it is found
*         within maxlen bytes. If it is not, the largest offset not exceeding
*         maxlen that falls on a character boundary. Zero if str is NULL or
*         an invalid sequence is found within the examined bytes (note that
*         zero is also the result for an empty string or maxlen == 0).
*/
size_t strnlen_utf8(const char *str, size_t maxlen) {
    size_t len = 0;

    if (str == NULL) {
        return 0;
    }

    /* Walk the string one whole character at a time. */
    while (len < maxlen && str[len] != '\0') {
        const unsigned char lead = (unsigned char)str[len];
        const size_t remaining = maxlen - len; /* always >= 1 here */
        size_t charsize;
        size_t avail;
        size_t off;

        /* Derive the sequence length from the lead byte. */
        if (lead <= 0x7F) {
            charsize = 1;
        }
        else if ((lead & 0xE0) == 0xC0) {
            charsize = 2;
        }
        else if ((lead & 0xF0) == 0xE0) {
            charsize = 3;
        }
        else if ((lead & 0xF8) == 0xF0) {
            charsize = 4;
        }
        else if ((lead & 0xFC) == 0xF8) {
            charsize = 5;
        }
        else if ((lead & 0xFE) == 0xFC) {
            charsize = 6;
        }
        else {
            /* Stray trailer byte (10xxxxxx) or 0xFE/0xFF: not a lead byte. */
            return 0;
        }

        /*
         * Check trailer bytes, but only those inside the maxlen window.
         * Reading further would run past the caller's limit (and past the
         * buffer when it is not null-terminated). A null terminator inside
         * the sequence fails the 10xxxxxx test, so the scan also never runs
         * past the end of a terminated string.
         */
        avail = (charsize < remaining) ? charsize : remaining;
        for (off = 1; off < avail; off++) {
            if (((unsigned char)str[len + off] & 0xC0) != 0x80) {
                /* Invalid bit pattern. */
                return 0;
            }
        }

        /*
         * The character straddles the limit: stop at the previous boundary.
         * Written as a subtraction so the comparison cannot wrap around.
         */
        if (charsize > remaining) {
            return len;
        }

        /* Otherwise, go next. */
        len += charsize;
    }

    /* len can never exceed maxlen here. */
    return len;
}
/**
* Demo entry point: measures a 15-byte UTF-8 sample with a 14-byte limit and
* prints the result.
* @return 0 always.
*/
int main(void) {
    /* Five 3-byte characters followed by the null terminator. */
    const unsigned char s[] = {
        0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0
    };

    /*
     * A limit of 14 falls in the middle of the last character.
     * size_t must be printed with %zu; %u is undefined behavior on
     * platforms where size_t is wider than unsigned int.
     */
    printf("%zu\n", strnlen_utf8((const char *)s, 14));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment