Munawwar · April 13, 2021 06:25
diff --git a/regex-test.js b/regex-test.js
 const whitespace = '[\\x20\\t\\r\\n\\f]';
 const unescapeRegExp = new RegExp('\\\\([\\da-f]{1,6}' + whitespace + '?|(' + whitespace + ')|.)', 'ig');

 function unescOrig(str) {
  return str.replace(unescapeRegExp, (_, escaped, escapedWhitespace) => {
    const high = '0x' + escaped - 0x10000;

    // NaN means non-codepoint
    // Workaround erroneous numeric interpretation of +"0x"
    // eslint-disable-next-line no-self-compare
    return high !== high || escapedWhitespace
        ? escaped
        : high < 0
            ? // BMP codepoint
            String.fromCharCode(high + 0x10000)
            : // Supplemental Plane codepoint (surrogate pair)
            String.fromCharCode((high >> 10) | 0xd800, (high & 0x3ff) | 0xdc00);
  });
 }

 function fasterRegexReplaceAll(str, regexWithGFlag, func) {
  const stashedIndex = regexWithGFlag.lastIndex;
  regexWithGFlag.lastIndex = -1;
  let output = '';
  let lastProcessedIndex = 0;
  let match;
  while ((match = regexWithGFlag.exec(str))) {
    output += str.slice(lastProcessedIndex, match.index) + func(...match, match.index, str);
    // prepare for next iteration
    lastProcessedIndex = regexWithGFlag.lastIndex;
  }
  output += str.slice(lastProcessedIndex);
  regexWithGFlag.lastIndex = stashedIndex;
  return output;
 }

 /**
 * @param {string} str 
 */
 function unescNew(str) {
  return fasterRegexReplaceAll(str, unescapeRegExp, (_, escaped, escapedWhitespace) => {
    const high = '0x' + escaped - 0x10000;

    // NaN means non-codepoint
    // Workaround erroneous numeric interpretation of +"0x"
    // eslint-disable-next-line no-self-compare
    return high !== high || escapedWhitespace
        ? escaped
        : high < 0
            ? // BMP codepoint
            String.fromCharCode(high + 0x10000)
            : // Supplemental Plane codepoint (surrogate pair)
            String.fromCharCode((high >> 10) | 0xd800, (high & 0x3ff) | 0xdc00);
  });
 }

 /**
 * 
 * @param {string} str 
 * @returns {[string, number]|undefined}
 */
 function gobbleHex(str) {
  const lower = str.toLowerCase();
  let hex = '';
  let spaceTerminated = false;
  for (let i = 0; i < 6 && lower[i] !== undefined; i++) {
      const code =  lower.charCodeAt(i);
      // check to see if we are dealing with a valid hex char [a-f|0-9]
      const valid = (code >= 97 && code <= 102) || (code >= 48 && code <= 57);
      // https://drafts.csswg.org/css-syntax/#consume-escaped-code-point
      spaceTerminated = code === 32;
      if (!valid) {
          break;
      }
      hex += lower[i];
  }

  if (hex.length === 0) {
      return undefined;
  }
  const codePoint = parseInt(hex, 16);

  const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
  // Add special case for
  // "If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point"
  // https://drafts.csswg.org/css-syntax/#maximum-allowed-code-point
  if (isSurrogate || codePoint === 0x0000 || codePoint > 0x10FFFF) {
      return ['\uFFFD', hex.length + (spaceTerminated ? 1 : 0)];
  }

  return [
      String.fromCodePoint(codePoint),
      hex.length + (spaceTerminated ? 1 : 0),
  ];
 }

 const CONTAINS_ESCAPE = /\\/;
 function unescFast(str) {
  let needToProcess = CONTAINS_ESCAPE.test(str);
  if (!needToProcess) {
      return str;
  }
  let ret = "";

  for (let i = 0; i < str.length; i++) {
      if ((str[i] === "\\")) {
          const gobbled = gobbleHex(str.slice(i+1, i+7));
          if (gobbled !== undefined) {
              ret += gobbled[0];
              i += gobbled[1];
              continue;
          }

          // Retain a pair of \\ if double escaped `\\\\`
          // https://github.com/postcss/postcss-selector-parser/commit/268c9a7656fb53f543dc620aa5b73a30ec3ff20e
          if (str[i +1] === "\\") {
              ret += "\\";
              i++;
              continue;
          }

          // if \\ is at the end of the string retain it
          // https://github.com/postcss/postcss-selector-parser/commit/01a6b346e3612ce1ab20219acc26abdc259ccefb
          if (str.length === i + 1) {
              ret += str[i];
          }
          continue;
      }
    
      ret += str[i];
  }

  return ret;
 }

 // run test first
 const inputStrings = [
  '#foo',
  '#w\\+',
  '#foo\\',
  '#wow\\\\k',
  '.\\31 23',
  '.\\🐐',
  '.\\E9motion',
  '.\\E9 dition',
  '.\\0000E9dition',
  '.\\1D306',
  '.\\1D306k',
 ];
 const assertEqual = (a, b) => {
  if (a !== b) throw new Error(`${a} !== ${b}`);
 }
 const test = () => inputStrings.forEach(str => assertEqual(
  unescOrig(str),
  unescNew(str),
 ));
 test();


 function time(taskName, task, runs = 10 ** 6) {
  const start = Date.now();
  for (let i = 0; i < runs; i += 1) {
    task();
  }
  const total = Date.now() - start;
  console.log(taskName, 'Time taken =', total, 'ms');
  return total;
 }

 const origTime = time('unescOrig()', () => inputStrings.forEach(str => unescOrig(str)));
 const newTime = time('unescNew()', () => inputStrings.forEach(str => unescNew(str)));
 const fastTime = time('unescFast()', () => inputStrings.forEach(str => unescFast(str)));

 console.log('unescNew() / unescOrig() ratio =', newTime / origTime);
 console.log('unescFast() / unescOrig() ratio =', fastTime / origTime);

 // output on node 14.16.1 (osx 10.14.6, macbook pro mid-2015)
 // unescOrig() Time taken = 8285 ms
 // unescNew() Time taken = 4314 ms
 // unescFast() Time taken = 2349 ms
 // unescNew() / unescOrig() ratio = 0.5207000603500301
 // unescFast() / unescOrig() ratio = 0.28352444176222086

 // tested on node v14 and v12 on osx and ubuntu, and results are similar.
	const whitespace = '[\\x20\\t\\r\\n\\f]';
	const unescapeRegExp = new RegExp('\\\\([\\da-f]{1,6}' + whitespace + '?\|(' + whitespace + ')\|.)', 'ig');

	function unescOrig(str) {
	return str.replace(unescapeRegExp, (_, escaped, escapedWhitespace) => {
	const high = '0x' + escaped - 0x10000;

	// NaN means non-codepoint
	// Workaround erroneous numeric interpretation of +"0x"
	// eslint-disable-next-line no-self-compare
	return high !== high \|\| escapedWhitespace
	? escaped
	: high < 0
	? // BMP codepoint
	String.fromCharCode(high + 0x10000)
	: // Supplemental Plane codepoint (surrogate pair)
	String.fromCharCode((high >> 10) \| 0xd800, (high & 0x3ff) \| 0xdc00);
	});
	}

	function fasterRegexReplaceAll(str, regexWithGFlag, func) {
	const stashedIndex = regexWithGFlag.lastIndex;
	regexWithGFlag.lastIndex = -1;
	let output = '';
	let lastProcessedIndex = 0;
	let match;
	while ((match = regexWithGFlag.exec(str))) {
	output += str.slice(lastProcessedIndex, match.index) + func(...match, match.index, str);
	// prepare for next iteration
	lastProcessedIndex = regexWithGFlag.lastIndex;
	}
	output += str.slice(lastProcessedIndex);
	regexWithGFlag.lastIndex = stashedIndex;
	return output;
	}

	/**
	* @param {string} str
	*/
	function unescNew(str) {
	return fasterRegexReplaceAll(str, unescapeRegExp, (_, escaped, escapedWhitespace) => {
	const high = '0x' + escaped - 0x10000;

	// NaN means non-codepoint
	// Workaround erroneous numeric interpretation of +"0x"
	// eslint-disable-next-line no-self-compare
	return high !== high \|\| escapedWhitespace
	? escaped
	: high < 0
	? // BMP codepoint
	String.fromCharCode(high + 0x10000)
	: // Supplemental Plane codepoint (surrogate pair)
	String.fromCharCode((high >> 10) \| 0xd800, (high & 0x3ff) \| 0xdc00);
	});
	}

	/**
	*
	* @param {string} str
	* @returns {[string, number]\|undefined}
	*/
	function gobbleHex(str) {
	const lower = str.toLowerCase();
	let hex = '';
	let spaceTerminated = false;
	for (let i = 0; i < 6 && lower[i] !== undefined; i++) {
	const code = lower.charCodeAt(i);
	// check to see if we are dealing with a valid hex char [a-f\|0-9]
	const valid = (code >= 97 && code <= 102) \|\| (code >= 48 && code <= 57);
	// https://drafts.csswg.org/css-syntax/#consume-escaped-code-point
	spaceTerminated = code === 32;
	if (!valid) {
	break;
	}
	hex += lower[i];
	}

	if (hex.length === 0) {
	return undefined;
	}
	const codePoint = parseInt(hex, 16);

	const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
	// Add special case for
	// "If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point"
	// https://drafts.csswg.org/css-syntax/#maximum-allowed-code-point
	if (isSurrogate \|\| codePoint === 0x0000 \|\| codePoint > 0x10FFFF) {
	return ['\uFFFD', hex.length + (spaceTerminated ? 1 : 0)];
	}

	return [
	String.fromCodePoint(codePoint),
	hex.length + (spaceTerminated ? 1 : 0),
	];
	}

	const CONTAINS_ESCAPE = /\\/;
	function unescFast(str) {
	let needToProcess = CONTAINS_ESCAPE.test(str);
	if (!needToProcess) {
	return str;
	}
	let ret = "";

	for (let i = 0; i < str.length; i++) {
	if ((str[i] === "\\")) {
	const gobbled = gobbleHex(str.slice(i+1, i+7));
	if (gobbled !== undefined) {
	ret += gobbled[0];
	i += gobbled[1];
	continue;
	}

	// Retain a pair of \\ if double escaped `\\\\`
	// https://github.com/postcss/postcss-selector-parser/commit/268c9a7656fb53f543dc620aa5b73a30ec3ff20e
	if (str[i +1] === "\\") {
	ret += "\\";
	i++;
	continue;
	}

	// if \\ is at the end of the string retain it
	// https://github.com/postcss/postcss-selector-parser/commit/01a6b346e3612ce1ab20219acc26abdc259ccefb
	if (str.length === i + 1) {
	ret += str[i];
	}
	continue;
	}

	ret += str[i];
	}

	return ret;
	}

	// run test first
	const inputStrings = [
	'#foo',
	'#w\\+',
	'#foo\\',
	'#wow\\\\k',
	'.\\31 23',
	'.\\🐐',
	'.\\E9motion',
	'.\\E9 dition',
	'.\\0000E9dition',
	'.\\1D306',
	'.\\1D306k',
	];
	const assertEqual = (a, b) => {
	if (a !== b) throw new Error(`${a} !== ${b}`);
	}
	const test = () => inputStrings.forEach(str => assertEqual(
	unescOrig(str),
	unescNew(str),
	));
	test();


	function time(taskName, task, runs = 10 ** 6) {
	const start = Date.now();
	for (let i = 0; i < runs; i += 1) {
	task();
	}
	const total = Date.now() - start;
	console.log(taskName, 'Time taken =', total, 'ms');
	return total;
	}

	const origTime = time('unescOrig()', () => inputStrings.forEach(str => unescOrig(str)));
	const newTime = time('unescNew()', () => inputStrings.forEach(str => unescNew(str)));
	const fastTime = time('unescFast()', () => inputStrings.forEach(str => unescFast(str)));

	console.log('unescNew() / unescOrig() ratio =', newTime / origTime);
	console.log('unescFast() / unescOrig() ratio =', fastTime / origTime);

	// output on node 14.16.1 (osx 10.14.6, macbook pro mid-2015)
	// unescOrig() Time taken = 8285 ms
	// unescNew() Time taken = 4314 ms
	// unescFast() Time taken = 2349 ms
	// unescNew() / unescOrig() ratio = 0.5207000603500301
	// unescFast() / unescOrig() ratio = 0.28352444176222086

	// tested on node v14 and v12 on osx and ubuntu, and results are similar.
No results found