mathiasbynens/unicodeEscape.js

mathiasbynens · 2011-09-29T09:54:39Z

We could save some bytes in the output by using escapes of the form \xab instead of \u1234. The code is more compact, too:

// based on @kitcambridge’s earlier ES3-compatible example
// Hex-only variant: rewrites every UTF-16 code unit of `str` as a `\xNN`
// escape. Only the last two hex digits of each code unit are kept, so code
// units above 0xFF are not represented faithfully.
function unicodeEscape(str) {
  function toHexEscape(character) {
    var hex = character.charCodeAt().toString(16);
    var padded = '0' + hex; // guarantees at least two digits before slicing
    return '\\x' + padded.slice(-2);
  }

  return str.replace(/[\s\S]/g, toHexEscape);
}

Of course, that wouldn’t work for characters whose code point has a hexadecimal value of more than two digits (e.g. ⌘). But we could look into merging these two approaches to create the shortest possible output string.

Something like this:

// Rewrites every UTF-16 code unit of `str` as an escape sequence, picking the
// shorter form: `\xNN` when the hex value fits in two digits, `\uNNNN`
// otherwise.
function unicodeEscape(str) {
  return str.replace(/[\s\S]/g, function(unit) {
    var hex = unit.charCodeAt().toString(16);
    var padded = '0000' + hex;

    if (hex.length > 2) {
      return '\\u' + padded.slice(-4);
    }
    return '\\x' + padded.slice(-2);
  });
}

Demo: http://mothereff.in/js-escapes

ghost · 2011-09-30T05:51:44Z

// Loop-based variant: walks `str` one UTF-16 code unit at a time and emits
// `\xNN` for values below 256 and `\uNNNN` for everything else.
function unicodeEscape(str) {
  var pieces = [];
  var position = 0;
  var code = str.charCodeAt(position);
  var hex;

  // charCodeAt returns NaN once the index runs past the end of the string.
  while (!isNaN(code)) {
    hex = code.toString(16);
    if (code < 256) {
      // A single hex digit (values 0-15) needs one leading zero.
      pieces.push('\\x' + (code > 15 ? '' : '0') + hex);
    } else {
      pieces.push('\\u' + ('0000' + hex).slice(-4));
    }
    position += 1;
    code = str.charCodeAt(position);
  }

  return pieces.join('');
}

A more efficient padding algorithm for two-digit hexadecimal values.

mathiasbynens · 2011-09-30T05:59:42Z

@kitcambridge Hawt. Keep the good stuff coming!

mathiasbynens · 2011-10-23T16:17:18Z

@kitcambridge It just “dawned on me” that we could use [^] instead of [\s\S] if IE < 9 support is not an issue. Performance tests here: http://jsperf.com/match-any-char-regex

mathiasbynens · 2011-11-29T08:22:45Z

Okay, so we use Unicode escapes (e.g. \u1234) and hexadecimal escapes (e.g. \x12)… What about octal escapes (e.g. \123)?

I quickly tested this in Node.js:

// For every UTF-16 code unit value, builds the shortest hex/Unicode escape
// and the octal escape (where one is produced), and logs the values for
// which the octal form is strictly shorter.
(function() {
    var LAST_CODE = 65535;
    var code;
    var hex;
    var oct;
    var hexOrUnicodeEscape;
    var octalEscape;

    for (code = 0; code <= LAST_CODE; code += 1) {
        hex = code.toString(16);
        oct = code.toString(8);

        if (code < 256) {
            hexOrUnicodeEscape = '\\x' + (code > 15 ? '' : '0') + hex;
        } else {
            hexOrUnicodeEscape = '\\u' + ('0000' + hex).slice(-4);
        }

        // `oct` is a string of octal digits; `<` coerces it to a decimal
        // number, so every value up to '377' (char code 255) passes.
        octalEscape = oct < 378 ? '\\' + oct : false;

        // log all characters for which octal escapes are the shortest solution
        if (octalEscape && octalEscape.length < hexOrUnicodeEscape.length) {
            console.log(code, String.fromCharCode(code), hexOrUnicodeEscape, octalEscape);
        }
    }
}());

Octal escapes can only be used for charCodes smaller than 256, and the test results show that they’re only shorter than Unicode/hex escapes for charCodes < 64:

 0  '\u0000'    '\\x00' '\\0'
 1  '\u0001'    '\\x01' '\\1'
 2  '\u0002'    '\\x02' '\\2'
 3  '\u0003'    '\\x03' '\\3'
 4  '\u0004'    '\\x04' '\\4'
 5  '\u0005'    '\\x05' '\\5'
 6  '\u0006'    '\\x06' '\\6'
 7  '\u0007'    '\\x07' '\\7'
 8  '\b'        '\\x08' '\\10'
 9  '\t'        '\\x09' '\\11'
10  '\n'        '\\x0a' '\\12'
11  '\u000b'    '\\x0b' '\\13'
12  '\f'        '\\x0c' '\\14'
13  '\r'        '\\x0d' '\\15'
14  '\u000e'    '\\x0e' '\\16'
15  '\u000f'    '\\x0f' '\\17'
16  '\u0010'    '\\x10' '\\20'
17  '\u0011'    '\\x11' '\\21'
18  '\u0012'    '\\x12' '\\22'
19  '\u0013'    '\\x13' '\\23'
20  '\u0014'    '\\x14' '\\24'
21  '\u0015'    '\\x15' '\\25'
22  '\u0016'    '\\x16' '\\26'
23  '\u0017'    '\\x17' '\\27'
24  '\u0018'    '\\x18' '\\30'
25  '\u0019'    '\\x19' '\\31'
26  '\u001a'    '\\x1a' '\\32'
27  '\u001b'    '\\x1b' '\\33'
28  '\u001c'    '\\x1c' '\\34'
29  '\u001d'    '\\x1d' '\\35'
30  '\u001e'    '\\x1e' '\\36'
31  '\u001f'    '\\x1f' '\\37'
32  ' '         '\\x20' '\\40'
33  '!'         '\\x21' '\\41'
34  '"'         '\\x22' '\\42'
35  '#'         '\\x23' '\\43'
36  '$'         '\\x24' '\\44'
37  '%'         '\\x25' '\\45'
38  '&'         '\\x26' '\\46'
39  '\''        '\\x27' '\\47'
40  '('         '\\x28' '\\50'
41  ')'         '\\x29' '\\51'
42  '*'         '\\x2a' '\\52'
43  '+'         '\\x2b' '\\53'
44  ','         '\\x2c' '\\54'
45  '-'         '\\x2d' '\\55'
46  '.'         '\\x2e' '\\56'
47  '/'         '\\x2f' '\\57'
48  '0'         '\\x30' '\\60'
49  '1'         '\\x31' '\\61'
50  '2'         '\\x32' '\\62'
51  '3'         '\\x33' '\\63'
52  '4'         '\\x34' '\\64'
53  '5'         '\\x35' '\\65'
54  '6'         '\\x36' '\\66'
55  '7'         '\\x37' '\\67'
56  '8'         '\\x38' '\\70'
57  '9'         '\\x39' '\\71'
58  ':'         '\\x3a' '\\72'
59  ';'         '\\x3b' '\\73'
60  '<'         '\\x3c' '\\74'
61  '='         '\\x3d' '\\75'
62  '>'         '\\x3e' '\\76'
63  '?'         '\\x3f' '\\77'

Of course, it’s problematic if you have e.g. '\0' immediately followed by another digit, e.g. 1, as it will alter the escape rather than append a new character:

'\0' == '\x00' // true
'\01' == '\x001' // false

Update: We probably shouldn’t use them:

Past editions of ECMAScript have included additional syntax and semantics for specifying octal literals and octal escape sequences. These have been removed from this edition of ECMAScript. This non-normative annex presents uniform syntax and semantics for octal literals and octal escape sequences for compatibility with some older ECMAScript programs.

ghost · 2011-11-29T16:35:52Z

@mathiasbynens Yes, it's best to avoid octal escape sequences...the OctalEscapeSequence production is deprecated in ES 5, and produces a syntax error in strict mode:

A conforming implementation, when processing strict mode code (see 10.1.1), may not extend the syntax of EscapeSequence to include OctalEscapeSequence as described in B.1.2. —Annex C

brandonros · 2016-03-23T15:50:49Z

I'm throwing this up here hoping to help somebody else down the road.

I had to restore partial keys from a Redis dump, and this function almost helped. Here is what I came up with.

Make sure to create the Redis client like this:

var client = redis.createClient(global['redis_port'], global['redis_host'], { return_buffers: true });

var fs = require('fs');

var redis = require('../lib/redis.js');

// Renders every byte of `buf` as a two-digit `\xNN` hex escape and returns
// the concatenated string.
function e(buf) {
    var parts = [];

    for (var i = 0; i < Buffer.byteLength(buf); i += 1) {
        var hex = buf[i].toString(16);
        // Single-digit values get a leading zero so each escape has two digits.
        parts.push('\\x' + (hex.length == 1 ? '0' + hex : hex));
    }

    return parts.join('');
}

// Reads one key per line from keys.txt and, strictly one key at a time,
// asks redis.dump for its payload and appends a RESTORE line for it to
// dump.txt. Keys for which redis.dump resolves to a falsy value are logged
// and skipped. Returns a promise that settles after the last key is handled.
function generate_dump() {
    var keys = fs.readFileSync('keys.txt').toString().split('\n');

    function dumpOne(key) {
        return redis.dump(key).then(function (payload) {
            if (!payload) {
                console.log('missing key', key);
                return;
            }

            fs.appendFileSync('dump.txt', 'RESTORE ' + key + ' 0 "' + e(payload) + '"\n');
        });
    }

    // Chain the keys so each dump starts only after the previous one finished.
    var chain = Promise.resolve();
    keys.forEach(function (key) {
        chain = chain.then(function () {
            return dumpOne(key);
        });
    });

    return chain;
}

// Entry point: initialise the redis helper, generate the dump, then log
// either 'done' or the stack of whatever error rejected the chain.
redis.init()
    .then(function () {
        return generate_dump();
    })
    .then(function () {
        console.log('done');
    })
    .catch(function (failure) {
        console.log(failure.stack);
    });

adamvleggett · 2016-11-12T00:19:23Z

If the goal is to do this with minimal code size, the following works well and minifies to ~100 bytes:

// Replaces every UTF-16 code unit outside the range U+0000..U+007E with a
// `\uNNNN` escape; everything inside that range is left as-is.
function escapeUnicode(str) {
    function toUnicodeEscape(ch) {
        var hex = ch.charCodeAt().toString(16);
        // Matched code units are >= 0x7F, so hex has at least two digits and
        // three zeros of padding are enough to reach four.
        var padded = "000" + hex;
        return "\\u" + padded.slice(-4);
    }

    return str.replace(/[^\0-~]/g, toUnicodeEscape);
}

F1LT3R · 2016-12-15T17:17:15Z

Fantastic! Thanks for this @mathiasbynens!

mervick · 2018-11-13T17:11:12Z

Replace only unicode characters

/**
 * Replaces every UTF-16 code unit in the range U+00A0..U+FFFF with a
 * `\uNNNN` escape and leaves everything below U+00A0 untouched.
 *
 * @param {string} str - Text to escape.
 * @returns {string} `str` with the matched code units escaped.
 */
function escapeUnicode(str) {
  // No `u` flag on purpose: in Unicode mode the class matches whole code
  // points, so symbols above U+FFFF fall outside the range and would be
  // left unescaped. Without it each surrogate half is matched and escaped.
  return str.replace(/[\u00A0-\uffff]/g, function (c) {
    return "\\u" + ("000" + c.charCodeAt().toString(16)).slice(-4);
  });
}

I use this to convert the UTF-8 content of JS files to Latin-1.

rafaelvanat · 2019-12-18T19:17:03Z

Very interesting work guys, thanks for sharing.
@mervick Your version was especially useful for my use case. Are there any restrictions on using it? Thanks!

mervick · 2019-12-19T03:10:15Z

@rafaelvanat I have used that in my project for more than a year, and so far there have been no problems.

josephrocca · 2020-06-18T11:21:03Z

@mervick @rafaelvanat If I use that function like this:

escapeUnicode("abc𝔸𝔹ℂ")

Then I get:

abc𝔸𝔹\u2102

The following function fixes this by matching all non-ASCII characters after splitting the string in a "unicode-safe" way (using [...str]). It then splits each Unicode character up into its UTF-16 code units, and gets the escape code for each (rather than just grabbing the first char code of each Unicode character):

// Leaves ASCII characters (U+0000..U+007F) untouched and replaces every
// other character with `\uNNNN` escapes, one per UTF-16 code unit, so
// symbols above U+FFFF come out as an escaped surrogate pair.
function escapeUnicode(str) {
  let escaped = "";

  // Iterating the string yields whole code points, not single code units.
  for (const symbol of str) {
    if (/^[\x00-\x7F]$/.test(symbol)) {
      escaped += symbol;
      continue;
    }

    for (let i = 0; i < symbol.length; i += 1) {
      escaped += "\\u" + symbol.charCodeAt(i).toString(16).padStart(4, "0");
    }
  }

  return escaped;
}

This gives the correct result:

abc\ud835\udd38\ud835\udd39\u2102

This seems to work fine in all my tests so far, but if I find any bugs I'll add fixes in this gist. Performance doesn't matter for my use-case, so I haven't benchmarked or optimised it at all.

mathiasbynens · 2020-06-18T17:01:46Z

Check out jsesc which solves this problem in a more robust manner.

josephrocca · 2020-06-19T06:40:39Z

@mathiasbynens It looks great! I did try to use it but unfortunately I'm not up to date with all the browserify/bundling stuff and just need a vanilla JS script (e.g. no use of Buffer) to include in a module import and wasn't able to work out how to do that with jsesc (though I admit I only poked around for a few minutes before deciding to write the function above). Also, out of pure curiosity I'd be interested in cases where the above function fails - I couldn't find any failing cases in my tests.

mathiasbynens · 2020-06-19T18:23:21Z

@josephrocca See https://github.com/mathiasbynens/jsesc#support. TL;DR: use v1.3.0.

mathiasbynens/unicodeEscape.js

mathiasbynens commented Sep 29, 2011

Uh oh!

ghost commented Sep 30, 2011

Uh oh!

mathiasbynens commented Sep 30, 2011

Uh oh!

mathiasbynens commented Oct 23, 2011

Uh oh!

mathiasbynens commented Nov 29, 2011 •

edited

Loading

Uh oh!

ghost commented Nov 29, 2011

Uh oh!

brandonros commented Mar 23, 2016

Uh oh!

adamvleggett commented Nov 12, 2016

Uh oh!

F1LT3R commented Dec 15, 2016 •

edited

Loading

Uh oh!

mervick commented Nov 13, 2018

Uh oh!

rafaelvanat commented Dec 18, 2019

Uh oh!

mervick commented Dec 19, 2019

Uh oh!

josephrocca commented Jun 18, 2020 •

edited

Loading

Uh oh!

mathiasbynens commented Jun 18, 2020

Uh oh!

josephrocca commented Jun 19, 2020 •

edited

Loading

Uh oh!

mathiasbynens commented Jun 19, 2020

Uh oh!

	// Turns every UTF-16 code unit of a string into an escape sequence, e.g.
	// '\n' becomes '\\x0a'. Uses a hex escape (\xNN) when the value fits in
	// two hex digits and a Unicode escape (\uNNNN) otherwise, whichever is
	// shortest.
	// Demo: http://mothereff.in/js-escapes
	function unicodeEscape(str) {
		function escapeUnit(unit) {
			var hex = unit.charCodeAt().toString(16);
			var needsUnicodeForm = hex.length > 2;
			var prefix = needsUnicodeForm ? 'u' : 'x';
			var width = needsUnicodeForm ? 4 : 2;
			// Left-pad with zeros, then keep exactly `width` digits.
			return '\\' + prefix + ('0000' + hex).slice(-width);
		}

		return str.replace(/[\s\S]/g, escapeUnit);
	}

mathiasbynens/unicodeEscape.js

mathiasbynens commented Sep 29, 2011

Uh oh!

ghost commented Sep 30, 2011

Uh oh!

mathiasbynens commented Sep 30, 2011

Uh oh!

mathiasbynens commented Oct 23, 2011

Uh oh!

mathiasbynens commented Nov 29, 2011 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

ghost commented Nov 29, 2011

Uh oh!

brandonros commented Mar 23, 2016

Uh oh!

adamvleggett commented Nov 12, 2016

Uh oh!

F1LT3R commented Dec 15, 2016 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

mervick commented Nov 13, 2018

Uh oh!

rafaelvanat commented Dec 18, 2019

Uh oh!

mervick commented Dec 19, 2019

Uh oh!

josephrocca commented Jun 18, 2020 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

mathiasbynens commented Jun 18, 2020

Uh oh!

josephrocca commented Jun 19, 2020 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

mathiasbynens commented Jun 19, 2020

Uh oh!

mathiasbynens commented Nov 29, 2011 •

edited

Loading

F1LT3R commented Dec 15, 2016 •

edited

Loading

josephrocca commented Jun 18, 2020 •

edited

Loading

josephrocca commented Jun 19, 2020 •

edited

Loading