Last active
December 24, 2022 06:34
-
-
Save jchook/f665a0d5096ab0283c3f51bab57ff132 to your computer and use it in GitHub Desktop.
JavaScript UTF-8 Helpers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Convert a string to an array of UTF-8 byte values.
 *
 * The string is walked one Unicode code point at a time, so a valid UTF-16
 * surrogate pair becomes a single 4-byte sequence. An unpaired surrogate has
 * no UTF-8 representation and is encoded as U+FFFD (EF BF BD), the same
 * substitution TextEncoder performs.
 *
 * @param {string} str
 * @return {Array} of bytes (integers in the range 0-255)
 */
export function strToUtf8Bytes(str) {
  const utf8 = [];
  for (let ii = 0; ii < str.length; ii++) {
    // codePointAt() only combines a high surrogate with a *following low*
    // surrogate; otherwise it returns the single UTF-16 unit unchanged.
    let codePoint = str.codePointAt(ii);
    if (codePoint > 0xffff) {
      // The code point occupied two UTF-16 units; skip the low half.
      ii++;
    } else if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
      // Lone surrogate: substitute the replacement character.
      codePoint = 0xfffd;
    }
    if (codePoint < 0x80) {
      utf8.push(codePoint);
    } else if (codePoint < 0x800) {
      utf8.push(0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f));
    } else if (codePoint < 0x10000) {
      utf8.push(
        0xe0 | (codePoint >> 12),
        0x80 | ((codePoint >> 6) & 0x3f),
        0x80 | (codePoint & 0x3f),
      );
    } else {
      utf8.push(
        0xf0 | (codePoint >> 18),
        0x80 | ((codePoint >> 12) & 0x3f),
        0x80 | ((codePoint >> 6) & 0x3f),
        0x80 | (codePoint & 0x3f),
      );
    }
  }
  return utf8;
}
/**
 * How many UTF-8 bytes does each UTF-16 code unit of the string account for?
 *
 * Code points up to U+FFFF occupy one UTF-16 unit and map to 1, 2 or 3 bytes.
 * A valid surrogate pair encodes to 4 bytes, reported as 2 bytes for each of
 * its two units so the result stays aligned with string indexes. An unpaired
 * surrogate counts as 3 bytes, the size of the U+FFFD replacement character
 * it becomes when encoded.
 *
 * @param {string} str
 * @return {Array} of integers, with length == str.length
 */
export function utf8BytesPerChar(str) {
  const utf8 = [];
  for (let ii = 0; ii < str.length; ii++) {
    // codePointAt() yields a value above 0xFFFF only for a well-formed pair,
    // so lone surrogates fall through to the 3-byte branch.
    const codePoint = str.codePointAt(ii);
    if (codePoint < 0x80) {
      utf8.push(1);
    } else if (codePoint < 0x800) {
      utf8.push(2);
    } else if (codePoint < 0x10000) {
      utf8.push(3);
    } else {
      // Surrogate pair: consume the low half and split the 4 bytes evenly.
      ii++;
      utf8.push(2, 2);
    }
  }
  return utf8;
}
/**
 * Byte length of a string once encoded as UTF-8 (comparable to PHP's
 * strlen(), which counts bytes rather than characters).
 * @param {string} str
 * @return {number} sum of the per-unit byte counts from utf8BytesPerChar()
 */
export function strlen(str) {
  let totalBytes = 0;
  for (const width of utf8BytesPerChar(str)) {
    totalBytes += width;
  }
  return totalBytes;
}
/**
 * Get a substring addressed by UTF-8 byte offsets, i.e. offsets that agree
 * with PHP's strlen()/substr() view of the same text.
 *
 * Offsets work like String.prototype.substring(start, end) measured in bytes:
 * `start` is inclusive, `end` is exclusive. An offset that lands inside a
 * multi-byte character is rounded down to the start of that character, so a
 * character (including a surrogate pair) is never split. Negative or
 * non-numeric offsets are treated as 0, and offsets past the end are clamped
 * to the end of the string.
 *
 * @param {string} str
 * @param {number} start byte offset of the first byte to include
 * @param {number} [end] byte offset to stop before; omit for "to the end"
 * @return {string} the selected text, or '' when end <= start
 */
export function substring(str, start, end) {
  const startByte = start > 0 ? start : 0;
  let endByte = Infinity;
  if (end != null) {
    endByte = end > 0 ? end : 0;
  }
  if (endByte <= startByte) {
    return '';
  }

  const strLen = str.length;
  // Defaults cover offsets at or beyond the encoded length of the string.
  let realStart = strLen;
  let realEnd = strLen;
  let startFound = false;
  let utf8Pos = 0;

  // Find the utf-16 start/end location, aka the "real" start/end
  for (let ii = 0; ii < strLen; ) {
    const codePoint = str.codePointAt(ii);
    let byteLen = 4;
    if (codePoint < 0x80) {
      byteLen = 1;
    } else if (codePoint < 0x800) {
      byteLen = 2;
    } else if (codePoint < 0x10000) {
      // Includes lone surrogates, which encode as the 3-byte U+FFFD.
      byteLen = 3;
    }
    const nextPos = utf8Pos + byteLen;
    // The first character whose bytes extend past an offset is the one that
    // contains (or starts at) that offset.
    if (!startFound && nextPos > startByte) {
      realStart = ii;
      startFound = true;
    }
    if (nextPos > endByte) {
      realEnd = ii;
      break;
    }
    utf8Pos = nextPos;
    ii += codePoint > 0xffff ? 2 : 1;
  }
  return str.substring(realStart, realEnd);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment