Last active
February 28, 2024 07:11
-
-
Save genki/e86e4907d0f5ed04340ab0ec55250499 to your computer and use it in GitHub Desktop.
A Uint8Array serializer that packs arbitrary bytes into a valid UTF-16 string.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Pack bytes into a valid UTF-16 string.
//
// Strategy:
//
// * 0x007f is used as the escape character (called ESC below)
// * if there is an ESC in the input, it is escaped as ESC + 0x08ff
// * if there is an unmatched surrogate, it is marked with the escape character
//
// 0x007f: escape, because it's rare but still only one UTF-8 byte.
//   To escape itself, use 0x007f 0x08ff (four bytes in UTF-8)
// 0x0000->0x001f: converted to ESC + 0x0020->0x003f (two bytes in UTF-8)
// unmatched surrogates: converted to ESC + (code-0xd800+0x0040) 0x0040->0x083f
//   (two to four bytes in UTF-8)
// BOM (0xfeff): ESC + 0x08fe (four bytes in UTF-8)
//
// Bytes are consumed in pairs as little-endian 16-bit code units. If the
// number of bytes is odd, the last byte XX is encoded as the code unit 0x00XX
// (escaped by the rules above when necessary) and a lone trailing ESC is
// appended to mark the odd length.
//
/**
 * Distance between a surrogate code unit (0xD800..0xDFFF) and its escaped
 * form (0x0040..0x083F). Subtract it to escape, add it back to restore.
 */
const SURROGATE_OFFSET = 0xD800 - 0x0040;
/**
 * Code unit that introduces every escape sequence in the packed string.
 * Note that this is U+007F (DEL), not the ASCII ESC control U+001B.
 */
const ESC = 0x007f;
/**
 * Converts a single 16-bit code unit into its packed string form.
 *
 * Code units that need protecting are emitted as a two-unit escape sequence
 * starting with `ESC`; everything else is emitted unchanged:
 *
 * - low surrogate (0xDC00..0xDFFF) -> ESC + (c - SURROGATE_OFFSET)
 * - BOM (0xFEFF)                   -> ESC + 0x08FE
 * - control (0x0000..0x001F)       -> ESC + (c + 0x0020)
 * - ESC itself                     -> ESC + 0x08FF
 *
 * High surrogates are not treated specially here and are returned as-is.
 *
 * @param c - The code unit to encode.
 * @returns A string of one or two code units.
 */
const encode = (c: number): string => {
  // Unmatched low surrogate: shift it out of the surrogate range.
  if (c >= 0xDC00 && c <= 0xDFFF) {
    return String.fromCharCode(ESC, c - SURROGATE_OFFSET);
  }
  // Escape the BOM.
  if (c === 0xFEFF) {
    return String.fromCharCode(ESC, 0x08FE);
  }
  // Control characters are moved up to 0x0020..0x003F behind an ESC; this is
  // more compact than what stringifying a raw control character produces.
  if (c <= 0x001f) {
    return String.fromCharCode(ESC, c + 0x0020);
  }
  // The escape character itself gets its own marker.
  if (c === ESC) {
    return String.fromCharCode(ESC, 0x08FF);
  }
  // Normal code unit: no escaping required.
  return String.fromCharCode(c);
};
/**
 * Packs arbitrary bytes into a string.
 *
 * Bytes are read in pairs as little-endian 16-bit code units. A high
 * surrogate immediately followed by a low surrogate is kept as a real
 * surrogate pair; any other surrogate is escaped, and the remaining code
 * units go through `encode`. When the input has an odd number of bytes, the
 * final byte is encoded as its own code unit and a lone trailing `ESC` is
 * appended so the decoder can tell that the high byte was not part of the
 * input.
 *
 * @param bytes - The bytes to pack.
 * @returns The packed string; `''` for empty input.
 */
export const packUint8Array = (bytes: Uint8Array): string => {
  let code = '';
  // High surrogate held back until the following code unit is known.
  let surrogate: number | undefined;
  // True when the next byte is the low byte of a new code unit.
  let low = true;
  let c = 0;
  for (const b of bytes) {
    if (low) {
      c = b;
      low = false;
      continue;
    }
    c |= b << 8;
    low = true;
    if (surrogate !== undefined) {
      const high = surrogate;
      surrogate = undefined;
      if (c >= 0xDC00 && c <= 0xDFFF) {
        // Valid surrogate pair: emit both halves untouched.
        code += String.fromCharCode(high, c);
        continue;
      }
      // The held high surrogate turned out to be unmatched, so escape it and
      // fall through to process the current code unit normally.
      code += String.fromCharCode(ESC, high - SURROGATE_OFFSET);
    }
    if (c >= 0xD800 && c <= 0xDBFF) {
      surrogate = c;
      continue;
    }
    code += encode(c);
  }
  // A high surrogate still pending at the end has no partner.
  if (surrogate !== undefined) {
    code += String.fromCharCode(ESC, surrogate - SURROGATE_OFFSET);
  }
  // `low` is false only when a dangling low byte was read (odd length):
  // emit it, then mark the odd length with a lone trailing ESC.
  if (!low) {
    code += encode(c) + String.fromCharCode(ESC);
  }
  return code;
};
/**
 * Unpacks a string produced by `packUint8Array` back into bytes.
 *
 * The string is walked one UTF-16 code unit at a time. An unescaped code unit
 * yields its low byte followed by its high byte; an `ESC` switches to escape
 * mode, in which the next code unit is mapped back to the value it stands
 * for. A string that ends right after an `ESC` encodes an odd number of
 * bytes, so the last (padding) byte is dropped.
 *
 * Malformed input does not throw: an unrecognized escape code is decoded as
 * the escape character, and unescaped lone surrogates are copied through as
 * raw code units.
 *
 * @param code - The packed string.
 * @returns The decoded bytes, as a view onto a buffer that may be larger
 *   than the view itself.
 */
export const unpackUint8Array = (code: string) => {
  // Every code unit yields at most two bytes, so this is always large enough.
  const bytes = new Uint8Array(code.length * 2);
  let j = 0;
  let escaped = false;
  // Walk code units rather than code points: both halves of a surrogate pair
  // decode exactly like ordinary code units, and a lone surrogate can no
  // longer produce a NaN read or push `j` past the end of the buffer.
  for (let i = 0; i < code.length; i++) {
    const c = code.charCodeAt(i);
    if (!escaped) {
      if (c === ESC) {
        escaped = true;
      } else {
        // Normal code unit, little-endian.
        bytes[j++] = c & 0xff;
        bytes[j++] = c >>> 8;
      }
      continue;
    }
    escaped = false;
    if (c < 0x0040) {
      // Restore a control character (0x0020..0x003F -> 0x0000..0x001F).
      bytes[j++] = (c - 0x0020) & 0xff;
      bytes[j++] = c >>> 8;
    } else if (c <= 0xDFFF - SURROGATE_OFFSET) {
      // Restore an escaped unmatched surrogate.
      const x = c + SURROGATE_OFFSET;
      bytes[j++] = x & 0xff;
      bytes[j++] = x >>> 8;
    } else if (c === 0x08FE) {
      // Restore the BOM (0xFEFF, little-endian).
      bytes[j++] = 0xFF;
      bytes[j++] = 0xFE;
    } else {
      // Restore the escape character; its high byte is already zero because
      // the buffer is zero-initialized, so just skip over it.
      bytes[j++] = ESC;
      j++;
    }
  }
  // Ending while escaped means the original length was odd: drop the padding
  // high byte. The guard keeps `j` non-negative for an ESC-only string.
  if (escaped && j > 0) j--;
  return bytes.subarray(0, j);
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This serializer can encode arbitrary byte arrays as valid UTF-16 strings.
Because JavaScript uses UTF-16 strings internally, this is useful in some situations.
For example,