slevithan · May 7, 2012 20:54 · slevithan · May 7, 2012
diff --git a/xregexp-unicode-codepoints.js b/xregexp-unicode-codepoints.js
 // Turn on XRegExp's optional "extensibility" feature, which allows syntax extensions such as the
 // token registered with XRegExp.addToken in this file
 XRegExp.install("extensibility");

 /* Registers `\u{n..}` as XRegExp syntax for writing a character by its Unicode code point.
 * `n..` stands for 1-6 hexadecimal digits whose value lies in the range 0-10FFFF; a larger value
 * throws a SyntaxError. The syntax comes from ES6 proposals. Values up to U+FFFF are rewritten as
 * a single `\uNNNN` escape. Values above U+FFFF are rewritten as a surrogate pair, so e.g.
 * `\u{20B20}` is just another way of writing `\uD842\uDF20`. Since that pair is two separate code
 * units, putting a quantifier after such a token, or using the token inside a character class, can
 * lead to broken behavior. Using `\u{n..}` with code points above U+FFFF is therefore not
 * recommended unless you know exactly what you're doing. This handling follows ES6 proposals for
 * `\u{n..}`, since compatibility concerns prevent JavaScript regexes from changing to be based on
 * code points rather than code units by default.
 */
 XRegExp.addToken(
     /\\u{([0-9A-Fa-f]{1,6})}/,
     (function () {
         // Renders one UTF-16 code unit as a `\uNNNN` escape, zero-padding the lowercase hex
         // digits to a width of four
         function toEscape(unit) {
             var digits = unit.toString(16);
             while (digits.length < 4) {
                 digits = "0" + digits;
             }
             return "\\u" + digits;
         }

         return function (match) {
             var codePoint = parseInt(match[1], 16),
                 astral;
             if (codePoint > 0x10FFFF) {
                 throw new SyntaxError("invalid Unicode code point " + match[0]);
             }
             if (codePoint > 0xFFFF) {
                 // Beyond the BMP: emit the high surrogate followed by the low surrogate
                 astral = codePoint - 0x10000;
                 return toEscape(0xD800 + (astral >> 10)) + toEscape(0xDC00 + (astral & 0x3FF));
             }
             // Emitting \uNNNN rather than the literal character avoids needing to escape it and
             // keep it separate from preceding tokens
             return toEscape(codePoint);
         };
     }()),
     {scope: "all"}
 );
	// Turn on XRegExp's optional "extensibility" feature, which allows syntax extensions such as the
	// token registered with XRegExp.addToken in this file
	XRegExp.install("extensibility");

	/* Registers `\u{n..}` as XRegExp syntax for writing a character by its Unicode code point.
	* `n..` stands for 1-6 hexadecimal digits whose value lies in the range 0-10FFFF; a larger value
	* throws a SyntaxError. The syntax comes from ES6 proposals. Values up to U+FFFF are rewritten as
	* a single `\uNNNN` escape. Values above U+FFFF are rewritten as a surrogate pair, so e.g.
	* `\u{20B20}` is just another way of writing `\uD842\uDF20`. Since that pair is two separate code
	* units, putting a quantifier after such a token, or using the token inside a character class, can
	* lead to broken behavior. Using `\u{n..}` with code points above U+FFFF is therefore not
	* recommended unless you know exactly what you're doing. This handling follows ES6 proposals for
	* `\u{n..}`, since compatibility concerns prevent JavaScript regexes from changing to be based on
	* code points rather than code units by default.
	*/
	XRegExp.addToken(
		/\\u{([0-9A-Fa-f]{1,6})}/,
		(function () {
			// Renders one UTF-16 code unit as a `\uNNNN` escape, zero-padding the lowercase hex
			// digits to a width of four
			function toEscape(unit) {
				var digits = unit.toString(16);
				while (digits.length < 4) {
					digits = "0" + digits;
				}
				return "\\u" + digits;
			}

			return function (match) {
				var codePoint = parseInt(match[1], 16),
					astral;
				if (codePoint > 0x10FFFF) {
					throw new SyntaxError("invalid Unicode code point " + match[0]);
				}
				if (codePoint > 0xFFFF) {
					// Beyond the BMP: emit the high surrogate followed by the low surrogate
					astral = codePoint - 0x10000;
					return toEscape(0xD800 + (astral >> 10)) + toEscape(0xDC00 + (astral & 0x3FF));
				}
				// Emitting \uNNNN rather than the literal character avoids needing to escape it and
				// keep it separate from preceding tokens
				return toEscape(codePoint);
			};
		}()),
		{scope: "all"}
	);