Last active
April 13, 2021 06:25
-
-
Save Munawwar/af9432d15118935e1e7cca163f0e4648 to your computer and use it in GitHub Desktop.
Testing the speed of regex replace approach
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const whitespace = '[\\x20\\t\\r\\n\\f]'; | |
const unescapeRegExp = new RegExp('\\\\([\\da-f]{1,6}' + whitespace + '?|(' + whitespace + ')|.)', 'ig'); | |
function unescOrig(str) { | |
return str.replace(unescapeRegExp, (_, escaped, escapedWhitespace) => { | |
const high = '0x' + escaped - 0x10000; | |
// NaN means non-codepoint | |
// Workaround erroneous numeric interpretation of +"0x" | |
// eslint-disable-next-line no-self-compare | |
return high !== high || escapedWhitespace | |
? escaped | |
: high < 0 | |
? // BMP codepoint | |
String.fromCharCode(high + 0x10000) | |
: // Supplemental Plane codepoint (surrogate pair) | |
String.fromCharCode((high >> 10) | 0xd800, (high & 0x3ff) | 0xdc00); | |
}); | |
} | |
function fasterRegexReplaceAll(str, regexWithGFlag, func) { | |
const stashedIndex = regexWithGFlag.lastIndex; | |
regexWithGFlag.lastIndex = -1; | |
let output = ''; | |
let lastProcessedIndex = 0; | |
let match; | |
while ((match = regexWithGFlag.exec(str))) { | |
output += str.slice(lastProcessedIndex, match.index) + func(...match, match.index, str); | |
// prepare for next iteration | |
lastProcessedIndex = regexWithGFlag.lastIndex; | |
} | |
output += str.slice(lastProcessedIndex); | |
regexWithGFlag.lastIndex = stashedIndex; | |
return output; | |
} | |
/** | |
* @param {string} str | |
*/ | |
function unescNew(str) { | |
return fasterRegexReplaceAll(str, unescapeRegExp, (_, escaped, escapedWhitespace) => { | |
const high = '0x' + escaped - 0x10000; | |
// NaN means non-codepoint | |
// Workaround erroneous numeric interpretation of +"0x" | |
// eslint-disable-next-line no-self-compare | |
return high !== high || escapedWhitespace | |
? escaped | |
: high < 0 | |
? // BMP codepoint | |
String.fromCharCode(high + 0x10000) | |
: // Supplemental Plane codepoint (surrogate pair) | |
String.fromCharCode((high >> 10) | 0xd800, (high & 0x3ff) | 0xdc00); | |
}); | |
} | |
/** | |
* | |
* @param {string} str | |
* @returns {[string, number]|undefined} | |
*/ | |
function gobbleHex(str) { | |
const lower = str.toLowerCase(); | |
let hex = ''; | |
let spaceTerminated = false; | |
for (let i = 0; i < 6 && lower[i] !== undefined; i++) { | |
const code = lower.charCodeAt(i); | |
// check to see if we are dealing with a valid hex char [a-f|0-9] | |
const valid = (code >= 97 && code <= 102) || (code >= 48 && code <= 57); | |
// https://drafts.csswg.org/css-syntax/#consume-escaped-code-point | |
spaceTerminated = code === 32; | |
if (!valid) { | |
break; | |
} | |
hex += lower[i]; | |
} | |
if (hex.length === 0) { | |
return undefined; | |
} | |
const codePoint = parseInt(hex, 16); | |
const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF; | |
// Add special case for | |
// "If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point" | |
// https://drafts.csswg.org/css-syntax/#maximum-allowed-code-point | |
if (isSurrogate || codePoint === 0x0000 || codePoint > 0x10FFFF) { | |
return ['\uFFFD', hex.length + (spaceTerminated ? 1 : 0)]; | |
} | |
return [ | |
String.fromCodePoint(codePoint), | |
hex.length + (spaceTerminated ? 1 : 0), | |
]; | |
} | |
const CONTAINS_ESCAPE = /\\/; | |
function unescFast(str) { | |
let needToProcess = CONTAINS_ESCAPE.test(str); | |
if (!needToProcess) { | |
return str; | |
} | |
let ret = ""; | |
for (let i = 0; i < str.length; i++) { | |
if ((str[i] === "\\")) { | |
const gobbled = gobbleHex(str.slice(i+1, i+7)); | |
if (gobbled !== undefined) { | |
ret += gobbled[0]; | |
i += gobbled[1]; | |
continue; | |
} | |
// Retain a pair of \\ if double escaped `\\\\` | |
// https://github.com/postcss/postcss-selector-parser/commit/268c9a7656fb53f543dc620aa5b73a30ec3ff20e | |
if (str[i +1] === "\\") { | |
ret += "\\"; | |
i++; | |
continue; | |
} | |
// if \\ is at the end of the string retain it | |
// https://github.com/postcss/postcss-selector-parser/commit/01a6b346e3612ce1ab20219acc26abdc259ccefb | |
if (str.length === i + 1) { | |
ret += str[i]; | |
} | |
continue; | |
} | |
ret += str[i]; | |
} | |
return ret; | |
} | |
// run test first | |
const inputStrings = [ | |
'#foo', | |
'#w\\+', | |
'#foo\\', | |
'#wow\\\\k', | |
'.\\31 23', | |
'.\\🐐', | |
'.\\E9motion', | |
'.\\E9 dition', | |
'.\\0000E9dition', | |
'.\\1D306', | |
'.\\1D306k', | |
]; | |
const assertEqual = (a, b) => { | |
if (a !== b) throw new Error(`${a} !== ${b}`); | |
} | |
const test = () => inputStrings.forEach(str => assertEqual( | |
unescOrig(str), | |
unescNew(str), | |
)); | |
test(); | |
function time(taskName, task, runs = 10 ** 6) { | |
const start = Date.now(); | |
for (let i = 0; i < runs; i += 1) { | |
task(); | |
} | |
const total = Date.now() - start; | |
console.log(taskName, 'Time taken =', total, 'ms'); | |
return total; | |
} | |
const origTime = time('unescOrig()', () => inputStrings.forEach(str => unescOrig(str))); | |
const newTime = time('unescNew()', () => inputStrings.forEach(str => unescNew(str))); | |
const fastTime = time('unescFast()', () => inputStrings.forEach(str => unescFast(str))); | |
console.log('unescNew() / unescOrig() ratio =', newTime / origTime); | |
console.log('unescFast() / unescOrig() ratio =', fastTime / origTime); | |
// output on node 14.16.1 (osx 10.14.6, macbook pro mid-2015) | |
// unescOrig() Time taken = 8285 ms | |
// unescNew() Time taken = 4314 ms | |
// unescFast() Time taken = 2349 ms | |
// unescNew() / unescOrig() ratio = 0.5207000603500301 | |
// unescFast() / unescOrig() ratio = 0.28352444176222086 | |
// tested on node v14 and v12 on osx and ubuntu, and results are similar. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment