Created
February 1, 2018 02:13
-
-
Save libsteve/e341760c0a7d5cf4e113b136277e5ad4 to your computer and use it in GitHub Desktop.
The proper way to reverse the code points in a UTF-8 encoded string.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <assert.h> | |
/// Count the amount of bytes contained within a null-terminated string. | |
/// - returns: The amount of bytes before the null-terminator in the given string.
int string_length(char const *const string); | |
/// Reverse an array of bytes in-place. | |
/// - parameter bytes: A buffer of bytes to be reversed. | |
/// - parameter length: The amount of bytes contained within the buffer that should be reversed. | |
void bytes_reverse(char *const bytes, int const length); | |
/// Copy an array of bytes into another byte array. | |
/// - parameter source: An array of bytes that should be copied. | |
/// - parameter length: The amount of bytes to copy. | |
/// - parameter destination: A byte buffer to copy the bytes into. | |
void bytes_copy(char const *const source, int const length, char *const destintaion); | |
/// Create a string representation of the eight bits in a byte. | |
/// - parameter string: The byte buffer into which the string representation should be written.
/// - parameter byte: The byte to represent as a string. | |
void string_print_byte(char *const string, char const byte); | |
/// Given a UTF-8 encoded string, determine the amount of bytes used to represent the first codepoint. | |
/// - parameter string: A UTF-8 encoded character array. | |
/// - returns: | |
/// * 0 when the first byte is a continuation byte.
/// * 1, 2, 3, or 4 for any valid UTF-8 codepoint. | |
/// * -1 when the first byte is invalid in UTF-8. | |
int utf8_string_next_codepoint_length(char const *const string); | |
/// Count the amount of codepoints within a given UTF-8 encoded string. | |
/// - parameter string: A valid UTF-8 encoded null-terminated byte string. | |
/// - returns: The amount of codepoints found before the null-terminator. | |
/// An invalid UTF-8 string will return a length of -1. | |
int utf8_string_codepoint_length(char const *const string); | |
/// Reverse in-place the codepoints of UTF-8 encoded string. | |
/// - parameter string: A valid UTF-8 encoded string. | |
void utf8_string_codepoint_reverse(char *const string); | |
/// Demo entry point: splice four multi-byte UTF-8 codepoints over the dash
/// placeholders in a sentence, print it, reverse its codepoints in-place,
/// and print it again.
/// - parameter argc: Unused.
/// - parameter argv: Unused.
/// - returns: Always 0.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    char cent[] = "\xc2\xa2";
    char euro[] = "\xe2\x82\xac";
    char clock[] = "\xe2\x8f\xb0";
    char circle[] = "\xf0\x90\x8d\x88";

    // Each run of dashes is exactly as long as the encoded codepoint that
    // replaces it, so the overall byte length (and terminator) is unchanged.
    char sentence[] = "What --- is the ---- turning the --- into --?";

    // `sizeof(x) - 1` is the encoded length without the array's terminator;
    // deriving it from the array keeps the count in sync with the literal.
    bytes_copy(clock, (int)sizeof(clock) - 1, sentence + 5);
    bytes_copy(circle, (int)sizeof(circle) - 1, sentence + 16);
    bytes_copy(euro, (int)sizeof(euro) - 1, sentence + 33);
    bytes_copy(cent, (int)sizeof(cent) - 1, sentence + 42);

    printf("%s\n", sentence);
    utf8_string_codepoint_reverse(sentence);
    printf("%s\n", sentence);
    return 0;
}
/// Count the bytes that precede the null-terminator of a string.
/// - parameter string: A non-NULL, null-terminated byte string.
/// - returns: The number of bytes found before the terminating '\0'.
int string_length(char const *const string) {
    assert( string != NULL );
    // Walk a cursor to the terminator; the distance travelled is the length.
    char const *cursor = string;
    while (*cursor != '\0') {
        cursor += 1;
    }
    return (int)(cursor - string);
}
/// Reverse an array of bytes in-place.
/// - parameter bytes: A non-NULL buffer of bytes to be reversed.
/// - parameter length: The number of bytes in the buffer to reverse. Must not
///   be negative; a length of 0 or 1 leaves the buffer untouched.
void bytes_reverse(char *const bytes, int const length) {
    assert( bytes != NULL );
    // Zero is a legitimate length (e.g. an empty string), so only negative
    // lengths are rejected.
    assert( length >= 0 );
    // Swap the outermost pair and move inwards until the indices meet.
    for (int start = 0, end = length - 1; start < end; start += 1, end -= 1) {
        char const byte = bytes[end];
        bytes[end] = bytes[start];
        bytes[start] = byte;
    }
}
/// Copy a run of bytes from one buffer into another, front to back.
/// - parameter source: The non-NULL buffer to read from.
/// - parameter length: How many bytes to copy; nothing is copied when this is
///   zero or negative.
/// - parameter destination: The non-NULL buffer to write into.
void bytes_copy(char const *const source, int const length, char *const destination) {
    assert( source != NULL );
    assert( destination != NULL );
    char const *from = source;
    char *to = destination;
    for (int remaining = length; remaining > 0; remaining -= 1) {
        *to = *from;
        to += 1;
        from += 1;
    }
}
/// Create a string representation of the eight bits in a byte.
/// Writes exactly eight characters, each '0' or '1', most significant bit
/// first. No null-terminator is written; the caller appends one if needed.
/// - parameter string: A non-NULL buffer with room for at least eight bytes.
/// - parameter byte: The byte to represent as a string.
void string_print_byte(char *const string, char byte) {
    assert( string != NULL );
    // Work on an unsigned copy so shifting never touches a sign bit.
    unsigned char const bits = (unsigned char)byte;
    for (int i = 0; i < 8; i += 1) {
        // Emit printable digits rather than raw 0/1 byte values, which would
        // read as NUL / control characters.
        string[i] = ((bits >> (7 - i)) & 1u) ? '1' : '0';
    }
}
/// Classify the first byte of a UTF-8 encoded string by its leading bits.
/// - parameter string: A non-NULL UTF-8 encoded character array.
/// - returns:
///   * 1 for a stand-alone byte (0.......).
///   * 0 for a continuation byte (10......).
///   * 2, 3, or 4 for the initial byte of a multi-byte codepoint
///     (110....., 1110...., 11110...).
///   * -1 for any other byte (11111...).
int utf8_string_next_codepoint_length(char const *const string) {
    assert( string != NULL );
    // The leading-bit patterns partition the byte range into contiguous
    // intervals, so comparing the unsigned value selects the same class.
    unsigned char const lead = (unsigned char)string[0];
    if (lead < 0x80) { return 1; }  // 0x00-0x7F
    if (lead < 0xC0) { return 0; }  // 0x80-0xBF
    if (lead < 0xE0) { return 2; }  // 0xC0-0xDF
    if (lead < 0xF0) { return 3; }  // 0xE0-0xEF
    if (lead < 0xF8) { return 4; }  // 0xF0-0xF7
    return -1;                      // 0xF8-0xFF
}
/// Count the amount of codepoints within a given UTF-8 encoded string.
/// - parameter string: A non-NULL, null-terminated byte string.
/// - returns: The amount of codepoints found before the null-terminator, or
///   -1 when the byte structure is not valid UTF-8: a stray continuation
///   byte, an invalid initial byte, a multi-byte sequence cut short by the
///   terminator, or an initial byte not followed by enough continuation bytes.
int utf8_string_codepoint_length(char const *const string) {
    int const raw_length = string_length(string);
    int length = 0;
    for (int i = 0; i < raw_length; length += 1) {
        int const char_len = utf8_string_next_codepoint_length(string + i);
        // Reject bad initial bytes and sequences that would run past the end.
        if (char_len <= 0 || char_len > raw_length - i) { return -1; }
        // Every byte after the initial one must be a continuation byte.
        for (int k = 1; k < char_len; k += 1) {
            if (utf8_string_next_codepoint_length(string + i + k) != 0) { return -1; }
        }
        i += char_len;
    }
    return length;
}
/// Reverse in-place the codepoints of a UTF-8 encoded string.
///
/// The bytes of each codepoint are reversed first and then the whole buffer
/// is reversed, which restores the byte order inside every codepoint while
/// reversing the order of the codepoints themselves.
///
/// - parameter string: A non-NULL, null-terminated, valid UTF-8 string.
///   Malformed input (stray continuation byte, invalid initial byte,
///   truncated or incomplete multi-byte sequence) fails an assertion; when
///   assertions are compiled out the string is left unmodified.
void utf8_string_codepoint_reverse(char *const string) {
    int const length = string_length(string);

    // Pass 1: validate the whole string before touching it, so malformed
    // input can never cause a write past the terminator or a partial edit.
    for (int i = 0, char_len = 0; i < length; i += char_len) {
        char_len = utf8_string_next_codepoint_length(string + i);
        assert( char_len > 0 && char_len < 5 );
        assert( char_len <= length - i );
        if (char_len <= 0 || char_len > length - i) { return; }
        for (int k = 1; k < char_len; k += 1) {
            int const is_continuation =
                utf8_string_next_codepoint_length(string + i + k) == 0;
            assert( is_continuation );
            if (!is_continuation) { return; }
        }
    }

    // Pass 2: reverse the bytes inside each multi-byte codepoint. The next
    // initial byte lies beyond the bytes just reversed, so it is still intact.
    for (int i = 0; i < length; ) {
        int const char_len = utf8_string_next_codepoint_length(string + i);
        if (char_len > 1) {
            bytes_reverse(string + i, char_len);
        }
        i += char_len;
    }

    // Pass 3: reverse the whole buffer; nothing to do for 0 or 1 bytes.
    if (length > 1) {
        bytes_reverse(string, length);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment