jchook · December 24, 2022 06:34
diff --git a/unicode.js b/unicode.js
 /**
  * Encode a JavaScript (UTF-16) string as an array of UTF-8 byte values.
  *
  * A well-formed surrogate pair is combined into one code point and emitted
  * as a 4-byte sequence. An unpaired surrogate has no valid UTF-8 form, so it
  * is emitted as U+FFFD REPLACEMENT CHARACTER (EF BF BD) - the same
  * substitution TextEncoder makes - instead of consuming the next character.
  *
  * @param {string} str
  * @return {Array} of byte values (integers 0-255)
  */
 export function strToUtf8Bytes(str) {
   const utf8 = [];
   for (let ii = 0; ii < str.length; ii++) {
     // codePointAt() joins a valid high+low surrogate pair into one code point
     // and returns the bare surrogate code unit when the pair is incomplete.
     const codePoint = str.codePointAt(ii);
     if (codePoint < 0x80) {
       utf8.push(codePoint);
     } else if (codePoint < 0x800) {
       utf8.push(0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f));
     } else if (codePoint >= 0xd800 && codePoint < 0xe000) {
       // Unpaired surrogate: substitute U+FFFD rather than emit invalid UTF-8.
       utf8.push(0xef, 0xbf, 0xbd);
     } else if (codePoint < 0x10000) {
       utf8.push(
         0xe0 | (codePoint >> 12),
         0x80 | ((codePoint >> 6) & 0x3f),
         0x80 | (codePoint & 0x3f),
       );
     } else {
       // Code point above U+FFFF: it occupied two UTF-16 code units, so step
       // over the low surrogate that codePointAt() already consumed.
       ii++;
       utf8.push(
         0xf0 | (codePoint >> 18),
         0x80 | ((codePoint >> 12) & 0x3f),
         0x80 | ((codePoint >> 6) & 0x3f),
         0x80 | (codePoint & 0x3f),
       );
     }
   }
   return utf8;
 }

 /**
  * UTF-8 byte count attributed to each UTF-16 code unit of a string.
  *
  * The result has exactly one entry per code unit. A valid surrogate pair
  * encodes to 4 bytes, reported as 2 + 2 across its two code units. An
  * unpaired surrogate counts as 3 bytes, the size of the U+FFFD replacement
  * that UTF-8 encoders such as TextEncoder substitute for it.
  *
  * @param {string} str
  * @return {Array} of integers, with length == str.length
  */
 export function utf8BytesPerChar(str) {
   const utf8 = [];
   for (let ii = 0; ii < str.length; ii++) {
     // codePointAt() only yields a value above 0xFFFF for a complete
     // high+low surrogate pair; a lone surrogate comes back as itself.
     const codePoint = str.codePointAt(ii);
     if (codePoint < 0x80) {
       utf8.push(1);
     } else if (codePoint < 0x800) {
       utf8.push(2);
     } else if (codePoint < 0x10000) {
       // Rest of the BMP, including unpaired surrogates (3 bytes as U+FFFD)
       utf8.push(3);
     } else {
       // Surrogate pair: split its 4 bytes evenly over both code units and
       // skip the low surrogate so the result stays aligned with the string.
       ii++;
       utf8.push(2, 2);
     }
   }
   return utf8;
 }

 /**
  * Byte length of a string once encoded as UTF-8, i.e. the number PHP's
  * strlen() would report for the same text. Computed by adding up the
  * per-code-unit byte counts returned by utf8BytesPerChar().
  * @param {string} str
  * @return {number}
  */
 export function strlen(str) {
   let total = 0;
   for (const width of utf8BytesPerChar(str)) {
     total += width;
   }
   return total;
 }

 /**
  * Get a substring addressed by UTF-8 byte offsets (the units PHP's substr()
  * and strlen() work in) rather than by UTF-16 indices.
  *
  * An offset that lands inside a multi-byte character is rounded down to the
  * start of that character, so a character (including a surrogate pair) is
  * never split: as `start` the whole character is included, as `end` it is
  * excluded. Offsets at or beyond the end of the string clamp to the end;
  * negative or NaN offsets clamp to 0. As with String.prototype.substring,
  * `end` is exclusive and the two positions are swapped if `end` precedes
  * `start`.
  *
  * @param {string} str
  * @param {number} start byte offset of the first byte to include (default 0)
  * @param {number} end byte offset to stop before; omit (undefined or null)
  *   to read through the end of the string. An explicit 0 is a real offset.
  * @return {string}
  */
 export function substring(str, start, end) {
   // Translate a UTF-8 byte offset into the UTF-16 index of the character
   // that contains that byte (or str.length when the offset is past the end).
   const indexAtByteOffset = (byteOffset) => {
     // Written as a negated test so that NaN and undefined also resolve to 0.
     if (!(byteOffset > 0)) {
       return 0;
     }
     let bytePos = 0;
     let index = 0;
     while (index < str.length) {
       const codePoint = str.codePointAt(index);
       let byteWidth = 4;
       if (codePoint < 0x80) {
         byteWidth = 1;
       } else if (codePoint < 0x800) {
         byteWidth = 2;
       } else if (codePoint < 0x10000) {
         byteWidth = 3;
       }
       // Stop on the character whose bytes cover the requested offset.
       if (bytePos + byteWidth > byteOffset) {
         break;
       }
       bytePos += byteWidth;
       // A 4-byte code point occupies two UTF-16 code units.
       index += byteWidth === 4 ? 2 : 1;
     }
     return index;
   };

   const realStart = indexAtByteOffset(start);
   const realEnd = end == null ? str.length : indexAtByteOffset(end);
   return str.substring(realStart, realEnd);
 }
	/**
	 * Encode a JavaScript (UTF-16) string as an array of UTF-8 byte values.
	 *
	 * A well-formed surrogate pair is combined into one code point and emitted
	 * as a 4-byte sequence. An unpaired surrogate has no valid UTF-8 form, so it
	 * is emitted as U+FFFD REPLACEMENT CHARACTER (EF BF BD) - the same
	 * substitution TextEncoder makes - instead of consuming the next character.
	 *
	 * @param {string} str
	 * @return {Array} of byte values (integers 0-255)
	 */
	export function strToUtf8Bytes(str) {
	  const utf8 = [];
	  for (let ii = 0; ii < str.length; ii++) {
	    // codePointAt() joins a valid high+low surrogate pair into one code point
	    // and returns the bare surrogate code unit when the pair is incomplete.
	    const codePoint = str.codePointAt(ii);
	    if (codePoint < 0x80) {
	      utf8.push(codePoint);
	    } else if (codePoint < 0x800) {
	      utf8.push(0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f));
	    } else if (codePoint >= 0xd800 && codePoint < 0xe000) {
	      // Unpaired surrogate: substitute U+FFFD rather than emit invalid UTF-8.
	      utf8.push(0xef, 0xbf, 0xbd);
	    } else if (codePoint < 0x10000) {
	      utf8.push(
	        0xe0 | (codePoint >> 12),
	        0x80 | ((codePoint >> 6) & 0x3f),
	        0x80 | (codePoint & 0x3f),
	      );
	    } else {
	      // Code point above U+FFFF: it occupied two UTF-16 code units, so step
	      // over the low surrogate that codePointAt() already consumed.
	      ii++;
	      utf8.push(
	        0xf0 | (codePoint >> 18),
	        0x80 | ((codePoint >> 12) & 0x3f),
	        0x80 | ((codePoint >> 6) & 0x3f),
	        0x80 | (codePoint & 0x3f),
	      );
	    }
	  }
	  return utf8;
	}

	/**
	 * UTF-8 byte count attributed to each UTF-16 code unit of a string.
	 *
	 * The result has exactly one entry per code unit. A valid surrogate pair
	 * encodes to 4 bytes, reported as 2 + 2 across its two code units. An
	 * unpaired surrogate counts as 3 bytes, the size of the U+FFFD replacement
	 * that UTF-8 encoders such as TextEncoder substitute for it.
	 *
	 * @param {string} str
	 * @return {Array} of integers, with length == str.length
	 */
	export function utf8BytesPerChar(str) {
	  const utf8 = [];
	  for (let ii = 0; ii < str.length; ii++) {
	    // codePointAt() only yields a value above 0xFFFF for a complete
	    // high+low surrogate pair; a lone surrogate comes back as itself.
	    const codePoint = str.codePointAt(ii);
	    if (codePoint < 0x80) {
	      utf8.push(1);
	    } else if (codePoint < 0x800) {
	      utf8.push(2);
	    } else if (codePoint < 0x10000) {
	      // Rest of the BMP, including unpaired surrogates (3 bytes as U+FFFD)
	      utf8.push(3);
	    } else {
	      // Surrogate pair: split its 4 bytes evenly over both code units and
	      // skip the low surrogate so the result stays aligned with the string.
	      ii++;
	      utf8.push(2, 2);
	    }
	  }
	  return utf8;
	}

	/**
	 * Size of a string in UTF-8 bytes, matching what PHP reports for the same
	 * data. The per-code-unit widths come from utf8BytesPerChar() and are
	 * totalled here.
	 * @param {string} str
	 * @return {number}
	 */
	export function strlen(str) {
	  const widths = utf8BytesPerChar(str);
	  let byteCount = 0;
	  for (let idx = 0; idx < widths.length; idx++) {
	    byteCount += widths[idx];
	  }
	  return byteCount;
	}

	/**
	 * Get a substring addressed by UTF-8 byte offsets (the units PHP's substr()
	 * and strlen() work in) rather than by UTF-16 indices.
	 *
	 * An offset that lands inside a multi-byte character is rounded down to the
	 * start of that character, so a character (including a surrogate pair) is
	 * never split: as `start` the whole character is included, as `end` it is
	 * excluded. Offsets at or beyond the end of the string clamp to the end;
	 * negative or NaN offsets clamp to 0. As with String.prototype.substring,
	 * `end` is exclusive and the two positions are swapped if `end` precedes
	 * `start`.
	 *
	 * @param {string} str
	 * @param {number} start byte offset of the first byte to include (default 0)
	 * @param {number} end byte offset to stop before; omit (undefined or null)
	 *   to read through the end of the string. An explicit 0 is a real offset.
	 * @return {string}
	 */
	export function substring(str, start, end) {
	  // Translate a UTF-8 byte offset into the UTF-16 index of the character
	  // that contains that byte (or str.length when the offset is past the end).
	  const indexAtByteOffset = (byteOffset) => {
	    // Written as a negated test so that NaN and undefined also resolve to 0.
	    if (!(byteOffset > 0)) {
	      return 0;
	    }
	    let bytePos = 0;
	    let index = 0;
	    while (index < str.length) {
	      const codePoint = str.codePointAt(index);
	      let byteWidth = 4;
	      if (codePoint < 0x80) {
	        byteWidth = 1;
	      } else if (codePoint < 0x800) {
	        byteWidth = 2;
	      } else if (codePoint < 0x10000) {
	        byteWidth = 3;
	      }
	      // Stop on the character whose bytes cover the requested offset.
	      if (bytePos + byteWidth > byteOffset) {
	        break;
	      }
	      bytePos += byteWidth;
	      // A 4-byte code point occupies two UTF-16 code units.
	      index += byteWidth === 4 ? 2 : 1;
	    }
	    return index;
	  };

	  const realStart = indexAtByteOffset(start);
	  const realEnd = end == null ? str.length : indexAtByteOffset(end);
	  return str.substring(realStart, realEnd);
	}