-
-
Save jasper2007111/86e5a7317c4d0e5c35ed71529467cfa0 to your computer and use it in GitHub Desktop.
UTF-8 to UTF-32 converter in C
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "unicode.h" | |
#include <stdint.h> | |
#include <stddef.h> | |
/**
 * Report whether `ch` is a Unicode scalar value.
 *
 * A scalar value is any code point in U+0000..U+10FFFF that is not a
 * UTF-16 surrogate (U+D800..U+DFFF).
 *
 * @param ch  code point to test
 * @return    non-zero if `ch` is a valid scalar value, 0 otherwise
 */
int is_valid_char(uint32_t ch)
{
    /* Nothing above U+10FFFF is a code point at all. */
    if (ch > 0x10ffff) {
        return 0;
    }
    /* Surrogates are reserved for UTF-16 and never stand alone. */
    return ch < 0xd800 || ch > 0xdfff;
}
/**
 * Report whether `ch` lies in one of the Unicode combining-mark blocks.
 *
 * The test is block-based: every code point inside a listed block counts,
 * whether or not it is currently assigned.
 *
 * @param ch  code point to test
 * @return    1 if `ch` is in a combining-mark block, 0 otherwise
 */
int is_combo_char(uint32_t ch)
{
    static const struct {
        uint32_t first;
        uint32_t last;
    } blocks[] = {
        { 0x0300, 0x036f }, /* Combining Diacritical Marks */
        { 0x1ab0, 0x1aff }, /* Combining Diacritical Marks Extended */
        { 0x1dc0, 0x1dff }, /* Combining Diacritical Marks Supplement */
        { 0x20d0, 0x20ff }, /* Combining Diacritical Marks for Symbols */
        { 0xfe20, 0xfe2f }, /* Combining Half Marks */
    };

    for (size_t k = 0; k < sizeof blocks / sizeof blocks[0]; ++k) {
        if (ch >= blocks[k].first && ch <= blocks[k].last) {
            return 1;
        }
    }
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef UNICODE_H | |
#define UNICODE_H | |
#include <stdint.h> | |
#include <stddef.h> | |
/**
 * One character cluster: a base code point followed by zero or more
 * combining marks.
 *
 * `codepoints` is a non-owning pointer to the first code point of the
 * cluster (utf32_getchars() stores the address of an element of its input
 * buffer here); `count` is the number of code points in the cluster.
 *
 * `codepoints` is a plain pointer rather than a flexible array member: a
 * flexible array member must be the last member of a struct and cannot be
 * assigned to.
 */
struct character {
    uint32_t *codepoints;
    size_t count;
};
int is_valid_char(uint32_t ch); | |
int is_combo_char(uint32_t ch); | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "utf16.h" | |
#include <stdint.h> | |
#include <stddef.h> | |
/**
 * Decode one code point from a UTF-16 buffer.
 *
 * @param buf  UTF-16 code units
 * @param idx  in/out: index of the next unit to read; advanced past the
 *             units consumed
 * @param len  number of code units in `buf`
 * @param cp   out: decoded code point; written only on success
 * @return     0 on success, -1 if the buffer is exhausted or a high
 *             surrogate is not followed by a low surrogate
 *
 * A lone low surrogate is returned as-is with status 0; callers are
 * expected to reject it with is_valid_char().
 */
static int getch(const uint16_t buf[], unsigned long *idx, size_t len,
                 uint32_t *cp)
{
    if (*idx >= len) {
        return -1;
    }

    uint16_t ch = buf[(*idx)++];
    if ((ch & 0xfc00) != 0xd800) {
        /* Not a high surrogate: the unit is the code point. */
        *cp = (uint32_t)ch;
        return 0;
    }

    /* A high surrogate needs a second unit; `>=` keeps the read in bounds. */
    if (*idx >= len) {
        return -1;
    }

    uint16_t nxt = buf[(*idx)++];
    if ((nxt & 0xfc00) != 0xdc00) {
        return -1;
    }

    /* Surrogate pairs encode the code point minus 0x10000. */
    *cp = 0x10000 + (((uint32_t)(ch & 0x03ff) << 10) | (uint32_t)(nxt & 0x03ff));
    return 0;
}
/**
 * Count the code points in a UTF-16 buffer.
 *
 * @param chars     UTF-16 code units
 * @param strlen    number of code units in `chars`
 * @param out_size  out: number of code points decoded; on failure it holds
 *                  the count of code points decoded before the error
 * @return          0 on success, -1 if decoding fails or a decoded value is
 *                  rejected by is_valid_char()
 */
int utf16_codepoint_count(uint16_t chars[], size_t strlen, size_t *out_size)
{
    unsigned long idx = 0;

    *out_size = 0;
    /* Iterate over input units consumed, not code points produced: a
     * surrogate pair consumes two units but yields one code point. */
    while (idx < strlen) {
        uint32_t cp;

        if (getch(chars, &idx, strlen, &cp) != 0 || !is_valid_char(cp)) {
            return -1;
        }
        ++*out_size;
    }
    return 0;
}
/**
 * Convert a UTF-16 buffer to UTF-32.
 *
 * @param input     UTF-16 code units
 * @param output    destination; must have room for at least `count`
 *                  elements, since every code point consumes at least one
 *                  input unit
 * @param count     number of code units in `input`
 * @param out_size  out: number of code points written to `output`; on
 *                  failure it holds the count written before the error
 * @return          0 on success, -1 if decoding fails or a decoded value is
 *                  rejected by is_valid_char()
 */
int utf16_to_utf32(uint16_t input[], uint32_t output[],
                   size_t count, size_t *out_size)
{
    unsigned long idx = 0;

    *out_size = 0;
    /* Bound the loop by input consumed; surrogate pairs make the output
     * shorter than the input. */
    while (idx < count) {
        uint32_t cp;

        if (getch(input, &idx, count, &cp) != 0 || !is_valid_char(cp)) {
            return -1;
        }
        output[(*out_size)++] = cp;
    }
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef UTF16_H | |
#define UTF16_H | |
#include <stdint.h> | |
#include <stddef.h> | |
#include "unicode.h" | |
int utf16_codepoint_count(uint16_t chars[], size_t strlen, size_t *out_size); | |
int utf16_to_utf32(uint16_t input[], uint32_t output[], size_t count, | |
size_t *out_size); | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "utf32.h" | |
/**
 * Split a UTF-32 buffer into character clusters (a base code point plus
 * any combining marks that follow it, as classified by is_combo_char()).
 *
 * Each output entry points into `utf32`; nothing is copied.
 *
 * @param utf32     UTF-32 code points
 * @param output    destination; must have room for at least `count`
 *                  entries, since every cluster holds at least one code
 *                  point
 * @param count     number of code points in `utf32`
 * @param out_size  out: number of clusters written to `output`
 * @return          0 on success, -1 if the buffer begins with a combining
 *                  mark (there is no base character for it to attach to)
 */
int utf32_getchars(uint32_t utf32[], struct character output[],
                   size_t count, size_t *out_size)
{
    size_t i = 0;

    *out_size = 0;

    /* Only the very first code point can be an orphan combining mark; any
     * later one attaches to the cluster before it. */
    if (count > 0 && is_combo_char(utf32[0])) {
        return -1;
    }

    while (i < count) {
        size_t start = i++;

        /* Absorb the combining marks that follow the base code point. */
        while (i < count && is_combo_char(utf32[i])) {
            ++i;
        }

        output[*out_size].codepoints = &utf32[start];
        output[*out_size].count = i - start;
        ++*out_size;
    }
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef UTF32_H | |
#define UTF32_H | |
#include <stdint.h> | |
#include <stddef.h> | |
#include "unicode.h" | |
int utf32_getchars(uint32_t utf32[], struct character output[], | |
size_t count, size_t *out_size); | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "utf8.h" | |
#include <stdint.h> | |
#include <stddef.h> | |
/**
 * Decode one code point from a UTF-8 buffer.
 *
 * @param buf  UTF-8 bytes
 * @param idx  in/out: index of the next byte to read; advanced past the
 *             bytes consumed
 * @param len  number of bytes in `buf`
 * @param cp   out: decoded code point; written only on success
 * @return     0 on success, -1 if the buffer is exhausted, the lead byte is
 *             not a valid UTF-8 lead, a continuation byte is missing or
 *             malformed, the sequence is overlong, or the value exceeds
 *             U+10FFFF
 *
 * Surrogate values (U+D800..U+DFFF) are not rejected here; callers are
 * expected to filter them with is_valid_char().
 */
static int getch(const uint8_t buf[], unsigned long *idx, size_t len,
                 uint32_t *cp)
{
    /* Smallest value that legitimately needs 0..3 continuation bytes;
     * anything below it is an overlong encoding. */
    static const uint32_t min_value[] = { 0x0, 0x80, 0x800, 0x10000 };

    if (*idx >= len) {
        return -1;
    }

    uint8_t lead = buf[(*idx)++];
    uint32_t value;
    int trailing;

    if (lead < 0x80) {
        trailing = 0;
        value = lead;
    } else if ((lead & 0xe0) == 0xc0) {
        trailing = 1;
        value = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {
        trailing = 2;
        value = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {
        trailing = 3;
        value = lead & 0x07;
    } else {
        /* Stray continuation byte or a 5+/invalid lead byte. */
        return -1;
    }

    for (int k = 0; k < trailing; ++k) {
        if (*idx >= len) {
            return -1;
        }
        uint8_t next = buf[*idx];
        if ((next & 0xc0) != 0x80) {
            /* Leave the offending byte unread. */
            return -1;
        }
        ++*idx;
        value = (value << 6) | (uint32_t)(next & 0x3f);
    }

    if (value < min_value[trailing] || value > 0x10ffff) {
        return -1;
    }

    *cp = value;
    return 0;
}
/**
 * Count the code points in a UTF-8 buffer.
 *
 * @param chars     UTF-8 bytes
 * @param strlen    number of bytes in `chars`
 * @param out_size  out: number of code points decoded; on failure it holds
 *                  the count of code points decoded before the error
 * @return          0 on success, -1 if decoding fails or a decoded value is
 *                  rejected by is_valid_char()
 */
int utf8_codepoint_count(uint8_t chars[], size_t strlen, size_t *out_size)
{
    unsigned long idx = 0;

    *out_size = 0;
    /* Iterate over bytes consumed, not code points produced: a multi-byte
     * sequence consumes several bytes but yields one code point. */
    while (idx < strlen) {
        uint32_t cp;

        if (getch(chars, &idx, strlen, &cp) != 0 || !is_valid_char(cp)) {
            return -1;
        }
        ++*out_size;
    }
    return 0;
}
/**
 * Convert a UTF-8 buffer to UTF-32.
 *
 * @param input     UTF-8 bytes
 * @param output    destination; must have room for at least `count`
 *                  elements, since every code point consumes at least one
 *                  input byte
 * @param count     number of bytes in `input`
 * @param out_size  out: number of code points written to `output`; on
 *                  failure it holds the count written before the error
 * @return          0 on success, -1 if decoding fails or a decoded value is
 *                  rejected by is_valid_char()
 */
int utf8_to_utf32(uint8_t input[], uint32_t output[], size_t count,
                  size_t *out_size)
{
    unsigned long idx = 0;

    *out_size = 0;
    /* Bound the loop by input consumed; multi-byte sequences make the
     * output shorter than the input. */
    while (idx < count) {
        uint32_t cp;

        if (getch(input, &idx, count, &cp) != 0 || !is_valid_char(cp)) {
            return -1;
        }
        output[(*out_size)++] = cp;
    }
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef UTF8_H | |
#define UTF8_H | |
#include <stdint.h> | |
#include <stddef.h> | |
#include "unicode.h" | |
int utf8_codepoint_count(uint8_t chars[], size_t strlen, size_t *out_size); | |
int utf8_to_utf32(uint8_t input[], uint32_t output[], size_t count, | |
size_t *out_size); | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment