Skip to content

Instantly share code, notes, and snippets.

@chadbrewbaker
Last active November 1, 2021 09:42
Show Gist options
  • Save chadbrewbaker/5ec5fbe06d294da95b15d17b70b4d4a3 to your computer and use it in GitHub Desktop.
Save chadbrewbaker/5ec5fbe06d294da95b15d17b70b4d4a3 to your computer and use it in GitHub Desktop.
Top UTF-8 Decoders
//Hideously bad decompilation of mbrtowc in MS libc.
#include <stdbool.h>
#define ushort unsigned short
#define longlong long long
#define uint unsigned int
#define byte unsigned char
#define ulonglong unsigned long long
void FUN_1800468c0(ushort *param_1,undefined8 *param_2)
{
if (param_1 != (ushort *)0x0) {
*param_1 = (ushort)*(undefined4 *)param_2 & 0x3ff | 0xdc00;
}
FUN_180041b10(0xfffffffffffffffd,param_2);
return;
}
/*
 * Tests the 16-bit field located 6 bytes past the address in param_1.
 * Returns 1 when that field equals -1 (0xffff), otherwise 0.
 * NOTE(review): callers pass a conversion-state pointer cast to an integer;
 * the -1 value presumably flags a pending second UTF-16 unit -- confirm.
 */
ulonglong FUN_180046910(longlong param_1)
{
  const short *field = (const short *)(param_1 + 6);

  if (*field == -1) {
    return 1;
  }
  return 0;
}
/*
 * Worker behind mbrtoc16: produces one UTF-16 code unit from multibyte input.
 *
 * param_1: destination code unit (may be NULL).
 * param_2: input bytes (may be NULL).
 * param_3: input length, carried in a pointer-typed value by the decompiler.
 * param_4: conversion state; DAT_1801b0040 is used when NULL.
 *
 * Flow:
 *   - If FUN_180046910 reports a pending unit in the state, delegate to
 *     FUN_1800468c0.
 *   - Otherwise decode one code point with FUN_180046990. When the input
 *     pointer is NULL or the decoder result is -1 or -2, that result is
 *     returned untouched.
 *   - Code points >= 0x110000 go to FUN_180041b50 (error path).
 *   - Code points >= 0x10000 go to FUN_180046860 (not visible in this file;
 *     presumably the surrogate-pair path).
 *   - Anything else is stored directly as a single 16-bit unit and the
 *     decoder result is passed through FUN_180041b10.
 */
longlong FUN_180046760(ushort *param_1,byte *param_2,byte *param_3,undefined8 *param_4)
{
  uint code_point[2];
  longlong result;
  undefined8 *state = param_4;

  if (state == (undefined8 *)0x0) {
    state = (undefined8 *)&DAT_1801b0040;
  }

  /* A pending unit from an earlier call takes priority over new input. */
  if ((FUN_180046910((longlong)state) & 0xff) != 0) {
    result = FUN_1800468c0(param_1,state);
    return result;
  }

  result = FUN_180046990(code_point,param_2,param_3,state);
  if ((param_2 == (byte *)0x0) || (result == -1) || (result == -2)) {
    return result;
  }

  if (code_point[0] >= 0x110000) {
    result = FUN_180041b50(state);
    return result;
  }

  if (code_point[0] >= 0x10000) {
    result = FUN_180046860(param_1,code_point[0],result,(int *)state);
    return result;
  }

  /* The value fits in one 16-bit unit. */
  if (param_1 != (ushort *)0x0) {
    *param_1 = (ushort)code_point[0];
  }
  result = FUN_180041b10(result,state);
  return result;
}
/*
 * Exported mbrtoc16 entry point; forwards all four arguments unchanged to
 * FUN_180046760.
 * NOTE(review): the worker returns a longlong that is dropped here because
 * the decompilation typed this export as void; the C11 mbrtoc16 returns
 * size_t -- confirm against the binary.
 */
void mbrtoc16(ushort *param_1,byte *param_2,byte *param_3,undefined8 *param_4)
{
  /* 0x46950 2362 mbrtoc16 */
  (void)FUN_180046760(param_1,param_2,param_3,param_4);
}
/*
 * Validates param_1 against the global DAT_1801afff0.
 * The check passes only when the two are equal and bits 48..63 of param_1
 * are zero; on any mismatch __report_gsfailure() is invoked.
 * NOTE(review): this looks like the MSVC /GS stack-cookie check -- confirm.
 */
void FUN_1801529a0(longlong param_1)
{
  if ((param_1 != DAT_1801afff0) || ((short)((ulonglong)param_1 >> 0x30) != 0)) {
    __report_gsfailure();
  }
}
/*
 * Clears the conversion state and passes a result value through.
 *
 * param_1: value to hand back to the caller unchanged.
 * param_2: state object; its first 8-byte slot is overwritten with an
 *          all-zero value.
 *
 * Returns param_1.
 */
undefined8 FUN_180041b10(undefined8 param_1,undefined8 *param_2)
{
  undefined8 blank;
  undefined *bytes = (undefined *)&blank;
  longlong i;

  /* Build an all-zero 8-byte value one byte at a time, then store it. */
  for (i = 0; i < 8; i++) {
    bytes[i] = 0;
  }
  *param_2 = blank;
  return param_1;
}
/*
 * Error exit shared by the decoders.
 *
 * Overwrites the first 8-byte slot of *param_1 with zeros, stores 0x2a in
 * the location returned by _errno(), and returns 0xffffffffffffffff
 * (all bits set, i.e. -1 when read as a signed 64-bit value).
 * NOTE(review): 0x2a (42) is presumably EILSEQ in the UCRT -- confirm.
 */
undefined8 FUN_180041b50(undefined8 *param_1)
{
  undefined8 blank;
  undefined *bytes = (undefined *)&blank;
  longlong i;
  int *err;

  /* Build an all-zero 8-byte value one byte at a time, then store it. */
  for (i = 0; i < 8; i++) {
    bytes[i] = 0;
  }
  *param_1 = blank;

  err = _errno();
  *err = 0x2a;
  return 0xffffffffffffffff;
}
/*
 * Decodes one UTF-8 sequence into a 32-bit code point, resumable across calls.
 *
 * param_1: destination for the decoded code point (may be NULL).
 * param_2: input bytes; when NULL, DAT_18015e078 with length 1 is substituted
 *          and nothing is stored through param_1.
 * param_3: input length, carried in a pointer-typed value by the decompiler.
 * param_4: conversion state; DAT_1801b0048 is used when NULL. As accessed
 *          here: offset 0 = partial code point (32 bits), offset 4 = total
 *          sequence length, offset 6 = bytes still missing (0 = initial state).
 *
 * NOTE(review): the function is declared void, yet FUN_180046760 assigns its
 * result and compares it against -1 and -2. The decompiler apparently dropped
 * the return value; local_38, the result of FUN_180041b50 and the value
 * passed to FUN_180041b10 look like the intended results -- confirm.
 */
void FUN_180046990(uint *param_1,byte *param_2,byte *param_3,undefined8 *param_4)
{
uint *local_res8;
byte *local_res10;
byte *local_res18;
undefined8 *local_res20;
undefined auStack104 [32];
byte local_48;
byte local_47;
byte local_46;
bool local_45;
byte local_44;
uint local_40;
bool local_3c;
byte local_3b;
uint local_38;
uint local_34;
byte *local_30;
uint local_28 [4];
ulonglong local_18;
/* Cookie derived from DAT_1801afff0 and a stack address; undone and checked at the exit label. */
local_18 = DAT_1801afff0 ^ (ulonglong)auStack104;
local_30 = param_2;
local_res20 = param_4;
if (param_4 == (undefined8 *)0x0) {
local_res20 = (undefined8 *)&DAT_1801b0048;
}
local_res8 = param_1;
local_res10 = param_2;
local_res18 = param_3;
/* NULL input: read one byte from DAT_18015e078 (presumably an empty string) and store no output. */
if (param_2 == (byte *)0x0) {
local_res10 = &DAT_18015e078;
local_res18 = (byte *)0x1;
local_res8 = (uint *)0x0;
}
/* A zero length skips all decoding and goes straight to the exit. */
if (local_res18 != (byte *)0x0) {
/* Initial state means no bytes are pending in the state object. */
local_45 = *(short *)((longlong)local_res20 + 6) == 0;
local_3c = local_45;
if (local_45) {
/* Fresh sequence: consume and classify the lead byte. */
local_46 = *local_res10;
local_res10 = local_res10 + 1;
if ((local_46 & 0x80) == 0) {
/* Single-byte (ASCII) case. */
if (local_res8 != (uint *)0x0) {
*local_res8 = (uint)local_46;
}
/* 1 for a non-zero byte, 0 for NUL; computed but not otherwise used here. */
local_38 = (uint)(local_46 != 0);
goto LAB_180046d31;
}
if ((local_46 & 0xe0) == 0xc0) {
/* 110xxxxx: 2-byte sequence. */
local_48 = 2;
}
else {
if ((local_46 & 0xf0) == 0xe0) {
/* 1110xxxx: 3-byte sequence. */
local_48 = 3;
}
else {
if ((local_46 & 0xf8) != 0xf0) {
/* Not a valid lead byte. */
FUN_180041b50(local_res20);
goto LAB_180046d31;
}
/* 11110xxx: 4-byte sequence. */
local_48 = 4;
}
}
/* In the initial state the byte budget counts the lead byte already consumed. */
local_47 = local_48;
/* Keep only the low (7 - length) payload bits of the lead byte. */
local_40 = (uint)local_46 & (1 << (7 - local_48 & 0x1f)) - 1U;
}
else {
/* Resume a partial sequence saved by an earlier call. */
local_40 = *(uint *)local_res20;
local_48 = *(byte *)((longlong)local_res20 + 4);
local_47 = *(byte *)((longlong)local_res20 + 6);
/* Saved length must be 2..4 and bytes missing must be 1..length-1. */
if ((((local_48 < 2) || (4 < local_48)) || (local_47 == 0)) || (local_48 <= local_47)) {
FUN_180041b50(local_res20);
goto LAB_180046d31;
}
}
/* Never read more bytes than this sequence still needs. */
if ((byte *)(ulonglong)local_47 < local_res18) {
local_res18 = (byte *)(ulonglong)local_47;
}
/* local_res10 - param_2 is the number of bytes consumed so far in this call. */
while (local_res10 + -(longlong)param_2 < local_res18) {
local_44 = *local_res10;
local_res10 = local_res10 + 1;
/* Continuation bytes must have the form 10xxxxxx. */
if ((local_44 & 0xc0) != 0x80) {
FUN_180041b50(local_res20);
goto LAB_180046d31;
}
/* Append the 6 payload bits. */
local_40 = local_40 << 6 | (uint)local_44 & 0x3f;
}
if (local_res18 < (byte *)(ulonglong)local_47) {
/* Input ran out mid-sequence: save progress in the state object. */
/* NOTE(review): no result is produced on this path; the caller's -2 check suggests one was lost. */
local_3b = local_47 - (char)local_res18;
*(uint *)local_res20 = local_40;
*(ushort *)((longlong)local_res20 + 4) = (ushort)local_48;
*(ushort *)((longlong)local_res20 + 6) = (ushort)local_3b;
}
else {
/* Reject 0xd800..0xdfff and anything at or above 0x110000. */
if (((local_40 < 0xd800) || (0xdfff < local_40)) && (local_40 < 0x110000)) {
/* Smallest value allowed for 2-, 3- and 4-byte forms; smaller means overlong. */
local_28[0] = 0x80;
local_28[1] = 0x800;
local_28[2] = 0x10000;
if (local_40 < local_28[(int)((uint)local_48 - 2)]) {
FUN_180041b50(local_res20);
}
else {
if (local_res8 != (uint *)0x0) {
*local_res8 = local_40;
}
/* 0 for a decoded NUL, otherwise this call's byte budget. */
if (local_40 == 0) {
local_34 = 0;
}
else {
local_34 = (uint)local_47;
}
/* Clear the state; FUN_180041b10 hands back its first argument, which is discarded here. */
FUN_180041b10((longlong)(int)local_34,local_res20);
}
}
else {
FUN_180041b50(local_res20);
}
}
}
LAB_180046d31:
/* Common exit: undo the XOR and let FUN_1801529a0 validate the cookie. */
FUN_1801529a0(local_18 ^ (ulonglong)auStack104);
return;
}
/*
 * Exported mbrtoc32 entry point; forwards all four arguments unchanged to
 * FUN_180046990.
 * NOTE(review): typed as void by the decompilation; the C11 mbrtoc32 returns
 * size_t -- confirm against the binary.
 */
void mbrtoc32(uint *param_1,byte *param_2,byte *param_3,undefined8 *param_4)
{
  /* 0x46d50 2363 mbrtoc32 */
  FUN_180046990(param_1,param_2,param_3,param_4);
}

Chrome UTF-8 Decoder

NGINX UTF-8 Javascript

Chrome UTF-8 DFA

Rust UTF-8 Decoder/validator

MUSL mbrtowc

SpiderMonkey Unicode seems to be using harfbuzz heavily. Harfbuzz unicode classification seems to be the most up to date.

Open JDK UTF8

GNU/BSD wc use:

#include <locale.h>
#include <wchar.h>

// File-scope conversion state; presumably the object passed as `ps` below.
mbstate_t state;

// Expands characters from narrow to wide format based on locale.
// pwc: destination wide character; s: narrow input; n: size_t limit that
// accompanies s; ps: conversion state carried between calls.
size_t mbrtowc( wchar_t *restrict pwc, const char *restrict s, size_t n,
                mbstate_t *restrict ps );

glibc mbrtowc

latest Apple libc which calls a locale specific mbrtowc

.NET Runtime UTF8. On Windows 10, the path for the debug binary of the C library is C:\WINDOWS\system32\ucrtbased.dll.

@aytey
Copy link

aytey commented Aug 26, 2021

Interesting decompilation, @chadbrewbaker -- did you get this from IDA/Ghidra/r2?

@chadbrewbaker
Copy link
Author

Ghidra and some hand tweaks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment