Created
May 5, 2015 17:01
-
-
Save rasky/b33ad065c79054fb9cd7 to your computer and use it in GitHub Desktop.
Fix msgpack-js-browser for emoji and other UTF-8 decoding/encoding bugs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| commit ef0d4a1d8ca5a809eb79e9203d4121107fc4e2c6 | |
| Author: Giovanni Bajo <rasky@develer.com> | |
| Date: Sat Dec 13 04:11:56 2014 +0100 | |
| Fix handling of emoji in msgpacks (handle surrogate pairs) | |
| diff --git a/src/msgpack/msgpack.js b/src/msgpack/msgpack.js | |
| index b30f2c4..7a91b55 100644 | |
| --- a/src/msgpack/msgpack.js | |
| +++ b/src/msgpack/msgpack.js | |
| @@ -34,12 +34,46 @@ function inspect(buffer) { | |
| return "<" + type + " " + bytes.join(" ") + ">"; | |
| } | |
| + | |
| +// Surrogate-aware version of charCodeAt: returns the full code point at a high surrogate and false at a low surrogate, so it can be called for every index of a for loop. | |
| +// Taken from: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/charCodeAt | |
| +function fixedCharCodeAt(str, idx) { | |
| + // ex. fixedCharCodeAt('\uD800\uDC00', 0); // 65536 | |
| + // ex. fixedCharCodeAt('\uD800\uDC00', 1); // false | |
| + idx = idx || 0; | |
| + var code = str.charCodeAt(idx); | |
| + var hi, low; | |
| + | |
| + // High surrogate (could change last hex to 0xDB7F to treat high | |
| + // private surrogates as single characters) | |
| + if (0xD800 <= code && code <= 0xDBFF) { | |
| + hi = code; | |
| + low = str.charCodeAt(idx + 1); | |
| + if (isNaN(low)) { | |
| + throw 'High surrogate not followed by low surrogate in fixedCharCodeAt()'; | |
| + } | |
| + return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000; | |
| + } | |
| + if (0xDC00 <= code && code <= 0xDFFF) { // Low surrogate | |
| + // We return false to allow loops to skip this iteration, since the high | |
| + // surrogate should already have been handled in the previous iteration | |
| + return false; | |
| + /*hi = str.charCodeAt(idx - 1); | |
| + low = code; | |
| + return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;*/ | |
| + } | |
| + return code; | |
| +} | |
| + | |
| // Encode string as utf8 into dataview at offset | |
| exports.utf8Write = utf8Write; | |
| function utf8Write(view, offset, string) { | |
| var byteLength = view.byteLength; | |
| for(var i = 0, l = string.length; i < l; i++) { | |
| - var codePoint = string.charCodeAt(i); | |
| + var codePoint = fixedCharCodeAt(string, i); | |
| + if (codePoint === false) { | |
| + continue; | |
| + } | |
| // One byte of UTF-8 | |
| if (codePoint < 0x80) { | |
| @@ -54,7 +88,7 @@ function utf8Write(view, offset, string) { | |
| continue; | |
| } | |
| - // Three bytes of UTF-8. | |
| + // Three bytes of UTF-8. | |
| if (codePoint < 0x10000) { | |
| view.setUint8(offset++, codePoint >>> 12 & 0x0f | 0xe0); | |
| view.setUint8(offset++, codePoint >>> 6 & 0x3f | 0x80); | |
| @@ -74,6 +108,21 @@ function utf8Write(view, offset, string) { | |
| } | |
| } | |
| +// ES6-style polyfills for working with whole Unicode code points instead of individual surrogate halves | |
| +// Taken from: http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp/3759300#3759300 | |
| +String.prototype.getCodePointLength= function() { | |
| + return this.length-this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length+1; | |
| +}; | |
| +String.fromCodePoint= function() { | |
| + var chars= Array.prototype.slice.call(arguments); | |
| + for (var i= chars.length; i-->0;) { | |
| + var n = chars[i]-0x10000; | |
| + if (n>=0) | |
| + chars.splice(i, 1, 0xD800+(n>>10), 0xDC00+(n&0x3FF)); | |
| + } | |
| + return String.fromCharCode.apply(null, chars); | |
| +}; | |
| + | |
| exports.utf8Read = utf8Read; | |
| function utf8Read(view, offset, length) { | |
| var string = ""; | |
| @@ -81,20 +130,20 @@ function utf8Read(view, offset, length) { | |
| var byte = view.getUint8(i); | |
| // One byte character | |
| if ((byte & 0x80) === 0x00) { | |
| - string += String.fromCharCode(byte); | |
| + string += String.fromCodePoint(byte); | |
| continue; | |
| } | |
| // Two byte character | |
| if ((byte & 0xe0) === 0xc0) { | |
| - string += String.fromCharCode( | |
| - ((byte & 0x0f) << 6) | | |
| + string += String.fromCodePoint( | |
| + ((byte & 0x1f) << 6) | | |
| (view.getUint8(++i) & 0x3f) | |
| ); | |
| continue; | |
| } | |
| // Three byte character | |
| if ((byte & 0xf0) === 0xe0) { | |
| - string += String.fromCharCode( | |
| + string += String.fromCodePoint( | |
| ((byte & 0x0f) << 12) | | |
| ((view.getUint8(++i) & 0x3f) << 6) | | |
| ((view.getUint8(++i) & 0x3f) << 0) | |
| @@ -103,7 +152,7 @@ function utf8Read(view, offset, length) { | |
| } | |
| // Four byte character | |
| if ((byte & 0xf8) === 0xf0) { | |
| - string += String.fromCharCode( | |
| + string += String.fromCodePoint( | |
| ((byte & 0x07) << 18) | | |
| ((view.getUint8(++i) & 0x3f) << 12) | | |
| ((view.getUint8(++i) & 0x3f) << 6) | | |
| @@ -120,7 +169,10 @@ exports.utf8ByteCount = utf8ByteCount; | |
| function utf8ByteCount(string) { | |
| var count = 0; | |
| for(var i = 0, l = string.length; i < l; i++) { | |
| - var codePoint = string.charCodeAt(i); | |
| + var codePoint = fixedCharCodeAt(string, i); | |
| + if (codePoint === false) { // other char in surrogate pair, skip | |
| + continue; | |
| + } | |
| if (codePoint < 0x80) { | |
| count += 1; | |
| continue; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment