rasky · May 5, 2015 17:01
diff --git a/gistfile1.diff b/gistfile1.diff
 commit ef0d4a1d8ca5a809eb79e9203d4121107fc4e2c6
 Author: Giovanni Bajo <rasky@develer.com>
 Date:   Sat Dec 13 04:11:56 2014 +0100

    Fix handling of emoji in msgpacks (handle surrogate pairs)

 diff --git a/src/msgpack/msgpack.js b/src/msgpack/msgpack.js
 index b30f2c4..7a91b55 100644
 --- a/src/msgpack/msgpack.js
 +++ b/src/msgpack/msgpack.js
 @@ -34,12 +34,46 @@ function inspect(buffer) {
   return "<" + type + " " + bytes.join(" ") + ">";
 }
 
 +
 +// Fixed version of charCodeAt that combines surrogate pairs into one code point, to be used in a for loop.
 +// Taken from: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/charCodeAt
 +function fixedCharCodeAt(str, idx) {
 +  // ex. fixedCharCodeAt('\uD800\uDC00', 0); // 65536
 +  // ex. fixedCharCodeAt('\uD800\uDC00', 1); // false
 +  idx = idx || 0;
 +  var code = str.charCodeAt(idx);
 +  var hi, low;
 +
 +  // High surrogate (could change last hex to 0xDB7F to treat high
 +  // private surrogates as single characters)
 +  if (0xD800 <= code && code <= 0xDBFF) {
 +    hi = code;
 +    low = str.charCodeAt(idx + 1);
 +    if (isNaN(low)) {
 +      throw 'High surrogate not followed by low surrogate in fixedCharCodeAt()';
 +    }
 +    return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;
 +  }
 +  if (0xDC00 <= code && code <= 0xDFFF) { // Low surrogate
 +    // We return false to allow loops to skip this iteration, since we should
 +    // have already handled the high surrogate in the previous iteration
 +    return false;
 +    /*hi = str.charCodeAt(idx - 1);
 +    low = code;
 +    return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;*/
 +  }
 +  return code;
 +}
 +
 // Encode string as utf8 into dataview at offset
 exports.utf8Write = utf8Write;
 function utf8Write(view, offset, string) {
   var byteLength = view.byteLength;
   for(var i = 0, l = string.length; i < l; i++) {
 -    var codePoint = string.charCodeAt(i);
 +    var codePoint = fixedCharCodeAt(string, i);
 +    if (codePoint === false) {
 +      continue;
 +    }
 
     // One byte of UTF-8
     if (codePoint < 0x80) {
 @@ -54,7 +88,7 @@ function utf8Write(view, offset, string) {
       continue;
     }
 
 -    // Three bytes of UTF-8.  
 +    // Three bytes of UTF-8.
     if (codePoint < 0x10000) {
       view.setUint8(offset++, codePoint >>> 12 & 0x0f | 0xe0);
       view.setUint8(offset++, codePoint >>> 6  & 0x3f | 0x80);
 @@ -74,6 +108,21 @@ function utf8Write(view, offset, string) {
   }
 }
 
 +// ES6 polyfills for handling unicode codepoints without surrogate pairs
 +// Taken from: http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp/3759300#3759300
 +String.prototype.getCodePointLength= function() {
 +    return this.length-this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length+1;
 +};
 +String.fromCodePoint= function() {
 +    var chars= Array.prototype.slice.call(arguments);
 +    for (var i= chars.length; i-->0;) {
 +        var n = chars[i]-0x10000;
 +        if (n>=0)
 +            chars.splice(i, 1, 0xD800+(n>>10), 0xDC00+(n&0x3FF));
 +    }
 +    return String.fromCharCode.apply(null, chars);
 +};
 +
 exports.utf8Read = utf8Read;
 function utf8Read(view, offset, length) {
   var string = "";
 @@ -81,20 +130,20 @@ function utf8Read(view, offset, length) {
     var byte = view.getUint8(i);
     // One byte character
     if ((byte & 0x80) === 0x00) {
 -      string += String.fromCharCode(byte);
 +      string += String.fromCodePoint(byte);
       continue;
     }
     // Two byte character
     if ((byte & 0xe0) === 0xc0) {
 -      string += String.fromCharCode(
 -        ((byte & 0x0f) << 6) | 
 +      string += String.fromCodePoint(
 +        ((byte & 0x1f) << 6) |
         (view.getUint8(++i) & 0x3f)
       );
       continue;
     }
     // Three byte character
     if ((byte & 0xf0) === 0xe0) {
 -      string += String.fromCharCode(
 +      string += String.fromCodePoint(
         ((byte & 0x0f) << 12) |
         ((view.getUint8(++i) & 0x3f) << 6) |
         ((view.getUint8(++i) & 0x3f) << 0)
 @@ -103,7 +152,7 @@ function utf8Read(view, offset, length) {
     }
     // Four byte character
     if ((byte & 0xf8) === 0xf0) {
 -      string += String.fromCharCode(
 +      string += String.fromCodePoint(
         ((byte & 0x07) << 18) |
         ((view.getUint8(++i) & 0x3f) << 12) |
         ((view.getUint8(++i) & 0x3f) << 6) |
 @@ -120,7 +169,10 @@ exports.utf8ByteCount = utf8ByteCount;
 function utf8ByteCount(string) {
   var count = 0;
   for(var i = 0, l = string.length; i < l; i++) {
 -    var codePoint = string.charCodeAt(i);
 +    var codePoint = fixedCharCodeAt(string, i);
 +    if (codePoint === false) {  // other char in surrogate pair, skip
 +      continue;
 +    }
     if (codePoint < 0x80) {
       count += 1;
       continue;
	commit ef0d4a1d8ca5a809eb79e9203d4121107fc4e2c6
	Author: Giovanni Bajo <rasky@develer.com>
	Date: Sat Dec 13 04:11:56 2014 +0100

	Fix handling of emoji in msgpacks (handle surrogate pairs)

	diff --git a/src/msgpack/msgpack.js b/src/msgpack/msgpack.js
	index b30f2c4..7a91b55 100644
	--- a/src/msgpack/msgpack.js
	+++ b/src/msgpack/msgpack.js
	@@ -34,12 +34,46 @@ function inspect(buffer) {
	return "<" + type + " " + bytes.join(" ") + ">";
	}

	+
	+// Fixed version of charCodeAt that combines surrogate pairs into one code point, to be used in a for loop.
	+// Taken from: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/charCodeAt
	+function fixedCharCodeAt(str, idx) {
	+ // ex. fixedCharCodeAt('\uD800\uDC00', 0); // 65536
	+ // ex. fixedCharCodeAt('\uD800\uDC00', 1); // false
	+ idx = idx \|\| 0;
	+ var code = str.charCodeAt(idx);
	+ var hi, low;
	+
	+ // High surrogate (could change last hex to 0xDB7F to treat high
	+ // private surrogates as single characters)
	+ if (0xD800 <= code && code <= 0xDBFF) {
	+ hi = code;
	+ low = str.charCodeAt(idx + 1);
	+ if (isNaN(low)) {
	+ throw 'High surrogate not followed by low surrogate in fixedCharCodeAt()';
	+ }
	+ return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;
	+ }
	+ if (0xDC00 <= code && code <= 0xDFFF) { // Low surrogate
	+ // We return false to allow loops to skip this iteration, since we should
	+ // have already handled the high surrogate in the previous iteration
	+ return false;
	+ /*hi = str.charCodeAt(idx - 1);
	+ low = code;
	+ return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;*/
	+ }
	+ return code;
	+}
	+
	// Encode string as utf8 into dataview at offset
	exports.utf8Write = utf8Write;
	function utf8Write(view, offset, string) {
	var byteLength = view.byteLength;
	for(var i = 0, l = string.length; i < l; i++) {
	- var codePoint = string.charCodeAt(i);
	+ var codePoint = fixedCharCodeAt(string, i);
	+ if (codePoint === false) {
	+ continue;
	+ }

	// One byte of UTF-8
	if (codePoint < 0x80) {
	@@ -54,7 +88,7 @@ function utf8Write(view, offset, string) {
	continue;
	}

	- // Three bytes of UTF-8.
	+ // Three bytes of UTF-8.
	if (codePoint < 0x10000) {
	view.setUint8(offset++, codePoint >>> 12 & 0x0f \| 0xe0);
	view.setUint8(offset++, codePoint >>> 6 & 0x3f \| 0x80);
	@@ -74,6 +108,21 @@ function utf8Write(view, offset, string) {
	}
	}

	+// ES6 polyfills for handling unicode codepoints without surrogate pairs
	+// Taken from: http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp/3759300#3759300
	+String.prototype.getCodePointLength= function() {
	+ return this.length-this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length+1;
	+};
	+String.fromCodePoint= function() {
	+ var chars= Array.prototype.slice.call(arguments);
	+ for (var i= chars.length; i-->0;) {
	+ var n = chars[i]-0x10000;
	+ if (n>=0)
	+ chars.splice(i, 1, 0xD800+(n>>10), 0xDC00+(n&0x3FF));
	+ }
	+ return String.fromCharCode.apply(null, chars);
	+};
	+
	exports.utf8Read = utf8Read;
	function utf8Read(view, offset, length) {
	var string = "";
	@@ -81,20 +130,20 @@ function utf8Read(view, offset, length) {
	var byte = view.getUint8(i);
	// One byte character
	if ((byte & 0x80) === 0x00) {
	- string += String.fromCharCode(byte);
	+ string += String.fromCodePoint(byte);
	continue;
	}
	// Two byte character
	if ((byte & 0xe0) === 0xc0) {
	- string += String.fromCharCode(
	- ((byte & 0x0f) << 6) \|
	+ string += String.fromCodePoint(
	+ ((byte & 0x1f) << 6) \|
	(view.getUint8(++i) & 0x3f)
	);
	continue;
	}
	// Three byte character
	if ((byte & 0xf0) === 0xe0) {
	- string += String.fromCharCode(
	+ string += String.fromCodePoint(
	((byte & 0x0f) << 12) \|
	((view.getUint8(++i) & 0x3f) << 6) \|
	((view.getUint8(++i) & 0x3f) << 0)
	@@ -103,7 +152,7 @@ function utf8Read(view, offset, length) {
	}
	// Four byte character
	if ((byte & 0xf8) === 0xf0) {
	- string += String.fromCharCode(
	+ string += String.fromCodePoint(
	((byte & 0x07) << 18) \|
	((view.getUint8(++i) & 0x3f) << 12) \|
	((view.getUint8(++i) & 0x3f) << 6) \|
	@@ -120,7 +169,10 @@ exports.utf8ByteCount = utf8ByteCount;
	function utf8ByteCount(string) {
	var count = 0;
	for(var i = 0, l = string.length; i < l; i++) {
	- var codePoint = string.charCodeAt(i);
	+ var codePoint = fixedCharCodeAt(string, i);
	+ if (codePoint === false) { // other char in surrogate pair, skip
	+ continue;
	+ }
	if (codePoint < 0x80) {
	count += 1;
	continue;
No results found