Last active
July 6, 2016 01:16
-
-
Save gocha/2e5018a5c4d67284c9ccfd5a9cc99120 to your computer and use it in GitHub Desktop.
strnlen for UTF-8. Considers the character boundary and an invalid sequence.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * @file
 * strnlen for UTF-8. Respects character boundaries and detects invalid sequences.
 */
#include <stdlib.h> | |
#include <stdio.h> | |
#include <string.h> | |
/**
 * Returns the length in bytes of a null-terminated UTF-8 string, examining at
 * most maxlen bytes and never counting a partially included multi-byte
 * character.
 *
 * Lead bytes announcing 1- to 6-byte sequences are accepted, and every trailer
 * byte must have the bit pattern 10xxxxxx. Overlong encodings and code point
 * ranges are not checked.
 *
 * @param str pointer to the null-terminated byte string to be examined; may be NULL.
 * @param maxlen maximum number of bytes to examine. Bytes at index maxlen and
 *        beyond are never read.
 * @return 0 if str is NULL or an invalid sequence is found within the first
 *         maxlen bytes. Otherwise the number of bytes before the null
 *         character, or, if no null character is found within maxlen bytes,
 *         the largest offset not greater than maxlen that falls on a
 *         character boundary.
 */
size_t strnlen_utf8(const char *str, size_t maxlen) {
    size_t len = 0;

    if (str == NULL) {
        return 0;
    }

    while (len < maxlen && str[len] != '\0') {
        unsigned char lead = (unsigned char)str[len];
        size_t remaining = maxlen - len; /* > 0, guaranteed by the loop condition */
        size_t charsize;
        size_t off;

        /* Derive the sequence length from the lead byte. */
        if (lead <= 0x7F) {
            charsize = 1;
        }
        else if ((lead & 0xE0) == 0xC0) {
            charsize = 2;
        }
        else if ((lead & 0xF0) == 0xE0) {
            charsize = 3;
        }
        else if ((lead & 0xF8) == 0xF0) {
            charsize = 4;
        }
        else if ((lead & 0xFC) == 0xF8) {
            charsize = 5;
        }
        else if ((lead & 0xFE) == 0xFC) {
            charsize = 6;
        }
        else {
            /* Stray trailer byte (10xxxxxx) or 0xFE/0xFF. */
            return 0;
        }

        /*
         * Check trailer bytes, but only those inside the maxlen window:
         * the buffer is not required to extend past maxlen bytes.
         * A null character inside a sequence fails the pattern test, so the
         * scan never runs past the terminator either.
         */
        for (off = 1; off < charsize && off < remaining; off++) {
            if (((unsigned char)str[len + off] & 0xC0) != 0x80) {
                /* Invalid bit pattern. */
                return 0;
            }
        }

        if (off < charsize) {
            /* The character is cut off by maxlen; stop at its boundary. */
            return len;
        }

        len += charsize;
    }

    /* len never exceeds maxlen here, so no clamping is needed. */
    return len;
}
/**
 * Demo: measures a 15-byte string of five 3-byte UTF-8 characters with a
 * 14-byte limit and prints the result. The limit cuts into the last
 * character, so only the bytes of the complete characters are counted.
 */
int main(void) {
    const unsigned char s[] = {
        0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0
    };

    /* strnlen_utf8 returns size_t, which must be printed with %zu, not %u. */
    printf("%zu\n", strnlen_utf8((const char *)s, 14));
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment