Skip to content

Instantly share code, notes, and snippets.

@rasky
Created May 5, 2015 17:01
Show Gist options
  • Select an option

  • Save rasky/b33ad065c79054fb9cd7 to your computer and use it in GitHub Desktop.

Select an option

Save rasky/b33ad065c79054fb9cd7 to your computer and use it in GitHub Desktop.
Fix msgpack-js-browser for emoji and other UTF-8 decoding/encoding bugs
commit ef0d4a1d8ca5a809eb79e9203d4121107fc4e2c6
Author: Giovanni Bajo <rasky@develer.com>
Date: Sat Dec 13 04:11:56 2014 +0100
Fix handling of emojii in msgpacks (handle surrogate pairs)
diff --git a/src/msgpack/msgpack.js b/src/msgpack/msgpack.js
index b30f2c4..7a91b55 100644
--- a/src/msgpack/msgpack.js
+++ b/src/msgpack/msgpack.js
@@ -34,12 +34,46 @@ function inspect(buffer) {
return "<" + type + " " + bytes.join(" ") + ">";
}
+
+// Surrogate-aware charCodeAt for index-based loops: returns the full code point at a high surrogate, false at a low surrogate, otherwise the plain char code.
+// Taken from: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/charCodeAt
+function fixedCharCodeAt(str, idx) {
+ // ex. fixedCharCodeAt('\uD800\uDC00', 0); // 65536 (0x10000: the pair combined into one code point)
+ // ex. fixedCharCodeAt('\uD800\uDC00', 1); // false (low half of a pair; the caller should skip it)
+ idx = idx || 0; // a missing (or otherwise falsy) idx means the first code unit
+ var code = str.charCodeAt(idx);
+ var hi, low;
+
+ // High surrogate (could change the upper bound to 0xDB7F to treat high
+ // private surrogates as single characters). NOTE(review): isNaN(low) only catches end-of-string; a following unit that is not a low surrogate is not rejected.
+ if (0xD800 <= code && code <= 0xDBFF) {
+ hi = code;
+ low = str.charCodeAt(idx + 1);
+ if (isNaN(low)) {
+ throw 'High surrogate not followed by low surrogate in fixedCharCodeAt()'; // NOTE(review): throws a plain string, not an Error object
+ }
+ return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000; // combine the pair into a single code point >= 0x10000
+ }
+ if (0xDC00 <= code && code <= 0xDFFF) { // Low surrogate
+ // Return false so the calling loop can skip this index: the pair should have been
+ // consumed at the preceding high-surrogate index. Note that a lone low surrogate also yields false.
+ return false;
+ /*hi = str.charCodeAt(idx - 1);
+ low = code;
+ return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;*/
+ }
+ return code;
+}
+
// Encode string as utf8 into dataview at offset
exports.utf8Write = utf8Write;
function utf8Write(view, offset, string) {
var byteLength = view.byteLength;
for(var i = 0, l = string.length; i < l; i++) {
- var codePoint = string.charCodeAt(i);
+ var codePoint = fixedCharCodeAt(string, i);
+ if (codePoint === false) {
+ continue;
+ }
// One byte of UTF-8
if (codePoint < 0x80) {
@@ -54,7 +88,7 @@ function utf8Write(view, offset, string) {
continue;
}
- // Three bytes of UTF-8.
+ // Three bytes of UTF-8.
if (codePoint < 0x10000) {
view.setUint8(offset++, codePoint >>> 12 & 0x0f | 0xe0);
view.setUint8(offset++, codePoint >>> 6 & 0x3f | 0x80);
@@ -74,6 +108,21 @@ function utf8Write(view, offset, string) {
}
}
+// ES6-style polyfills for handling Unicode code points instead of UTF-16 surrogate pairs
+// Taken from: http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp/3759300#3759300
+String.prototype.getCodePointLength= function() { // length in code points: each high+low surrogate pair counts once
+ return this.length-this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length+1; // split() yields (pairs + 1) pieces, so this is length - pairs
+};
+String.fromCodePoint= function() { // builds a string from code-point arguments; NOTE(review): assigned unconditionally, so it overwrites any native String.fromCodePoint
+ var chars= Array.prototype.slice.call(arguments); // copy the arguments into a real array so it can be spliced
+ for (var i= chars.length; i-->0;) { // walk backwards so splice() below never shifts entries not yet visited
+ var n = chars[i]-0x10000; // offset above the BMP; negative for values below 0x10000
+ if (n>=0)
+ chars.splice(i, 1, 0xD800+(n>>10), 0xDC00+(n&0x3FF)); // replace the code point with its high/low surrogate pair
+ }
+ return String.fromCharCode.apply(null, chars); // every entry is now passed as a UTF-16 code unit
+};
+
exports.utf8Read = utf8Read;
function utf8Read(view, offset, length) {
var string = "";
@@ -81,20 +130,20 @@ function utf8Read(view, offset, length) {
var byte = view.getUint8(i);
// One byte character
if ((byte & 0x80) === 0x00) {
- string += String.fromCharCode(byte);
+ string += String.fromCodePoint(byte);
continue;
}
// Two byte character
if ((byte & 0xe0) === 0xc0) {
- string += String.fromCharCode(
- ((byte & 0x0f) << 6) |
+ string += String.fromCodePoint(
+ ((byte & 0x1f) << 6) |
(view.getUint8(++i) & 0x3f)
);
continue;
}
// Three byte character
if ((byte & 0xf0) === 0xe0) {
- string += String.fromCharCode(
+ string += String.fromCodePoint(
((byte & 0x0f) << 12) |
((view.getUint8(++i) & 0x3f) << 6) |
((view.getUint8(++i) & 0x3f) << 0)
@@ -103,7 +152,7 @@ function utf8Read(view, offset, length) {
}
// Four byte character
if ((byte & 0xf8) === 0xf0) {
- string += String.fromCharCode(
+ string += String.fromCodePoint(
((byte & 0x07) << 18) |
((view.getUint8(++i) & 0x3f) << 12) |
((view.getUint8(++i) & 0x3f) << 6) |
@@ -120,7 +169,10 @@ exports.utf8ByteCount = utf8ByteCount;
function utf8ByteCount(string) {
var count = 0;
for(var i = 0, l = string.length; i < l; i++) {
- var codePoint = string.charCodeAt(i);
+ var codePoint = fixedCharCodeAt(string, i);
+ if (codePoint === false) { // other char in surrogate pair, skip
+ continue;
+ }
if (codePoint < 0x80) {
count += 1;
continue;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment