Skip to content

Instantly share code, notes, and snippets.

@peteroupc
Last active August 29, 2015 14:02
Show Gist options
  • Save peteroupc/08c5ecc8131a76062ffe to your computer and use it in GitHub Desktop.
Save peteroupc/08c5ecc8131a76062ffe to your computer and use it in GitHub Desktop.
UTF-7 decoder in JavaScript
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.com/d/
*/
/**
 * Assembles bytes into 16-bit code units (first byte is the high-order
 * half) and code units into text. Output strings are pushed onto a
 * caller-supplied "builder" (any object with a push method).
 *
 * State:
 *   lastByte  - first byte of a code unit still waiting for its second
 *               byte, or -1 when nothing is pending.
 *   surrogate - high surrogate still waiting for its low surrogate,
 *               or -1 when nothing is pending.
 */
var CodeUnitAppender = function() {
  this.lastByte = -1;
  this.surrogate = -1;
};
(function(proto, ctor) {
  // Text emitted in place of anything that cannot be decoded.
  ctor.replacement = "\ufffd";

  // A code unit is a surrogate when its top six bits match one of these.
  var SURROGATE_MASK = 0xfc00;
  var HIGH_SURROGATE = 0xd800;
  var LOW_SURROGATE = 0xdc00;

  /**
   * Flushes leftover state: pushes one replacement character for a
   * pending high surrogate and one for a pending byte, then clears both.
   */
  proto.finalizeAndReset = function(builder) {
    if (this.surrogate >= 0) {
      builder.push(ctor.replacement);
    }
    if (this.lastByte >= 0) {
      builder.push(ctor.replacement);
    }
    this.surrogate = -1;
    this.lastByte = -1;
  };

  /**
   * Records that a partial byte was seen, so that a later
   * finalizeAndReset reports it (lastByte must not stay at -1).
   */
  proto.appendIncompleteByte = function() {
    this.lastByte = 0;
  };

  /**
   * Feeds one byte. The first byte of a pair is held back; the second
   * completes a code unit, which is passed to appendCodeUnit.
   */
  proto.appendByte = function(value, builder) {
    var highHalf = this.lastByte;
    if (!(highHalf >= 0)) {
      // Nothing pending: remember this byte as the high-order half.
      this.lastByte = value;
      return;
    }
    this.appendCodeUnit((highHalf << 8) | (value & 0xff), builder);
    this.lastByte = -1;
  };

  /**
   * Feeds one 16-bit code unit, pairing surrogates and substituting the
   * replacement character for surrogates that cannot be paired.
   */
  proto.appendCodeUnit = function(codeunit, builder) {
    var topBits = codeunit & SURROGATE_MASK;
    var isLow = topBits === LOW_SURROGATE;
    var isHigh = topBits === HIGH_SURROGATE;
    var pendingHigh = this.surrogate;

    if (!(pendingHigh >= 0)) {
      if (isLow) {
        // low surrogate with no preceding high surrogate
        builder.push(ctor.replacement);
      } else if (isHigh) {
        // high surrogate: hold it until the next code unit arrives
        this.surrogate = codeunit;
      } else {
        // ordinary code unit
        builder.push(String.fromCharCode(codeunit));
      }
      return;
    }

    if (isLow) {
      // completes the pair
      builder.push(String.fromCharCode(pendingHigh));
      builder.push(String.fromCharCode(codeunit));
      this.surrogate = -1;
    } else if (isHigh) {
      // the held high surrogate is unpaired; hold the new one instead
      builder.push(ctor.replacement);
      this.surrogate = codeunit;
    } else {
      // the held high surrogate is unpaired; the new unit passes through
      builder.push(ctor.replacement);
      builder.push(String.fromCharCode(codeunit));
      this.surrogate = -1;
    }
  };

  /** Discards any pending byte or surrogate without emitting anything. */
  proto.reset = function() {
    this.surrogate = -1;
    this.lastByte = -1;
  };
})(CodeUnitAppender.prototype, CodeUnitAppender);
/**
 * UTF-7 decoder. Undecodable input never throws; it is replaced with
 * U+FFFD in the output.
 */
var Utf7 = function() {
};
/**
 * Maps a 7-bit byte value (0-127) to its base64 digit value, or -1 when
 * the byte is not part of the base64 alphabet. Padding ("=") is not
 * accepted. Laid out 16 entries per row.
 */
Utf7.Alphabet = [
  // 0x00-0x0f
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  // 0x10-0x1f
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  // 0x20-0x2f ('+' is 62, '/' is 63)
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
  // 0x30-0x3f (digits are 52-61)
  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
  // 0x40-0x4f (upper-case letters start at 0)
  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
  // 0x50-0x5f
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
  // 0x60-0x6f (lower-case letters start at 26)
  -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  // 0x70-0x7f
  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1
];
/**
 * Decodes UTF-7 data to a string.
 *
 * @param {string|Uint8Array|Array<number>} input Either a string whose
 *   char codes are taken as byte values, or an array-like of byte values.
 * @returns {string} The decoded text.
 * @throws {Error} If input is null or undefined.
 * @throws {TypeError} If input is not a string and has no numeric length.
 */
Utf7.prototype.decode = function(input) {
  if (typeof input == "string") {
    var bytes = new Uint8Array(input.length);
    for (var i = 0; i < input.length; i++) {
      // Char codes beyond 0xFF are clamped to 0xFF, which the decoder
      // treats as an illegal byte.
      bytes[i] = Math.min(0xFF, input.charCodeAt(i));
    }
    input = bytes;
  }
  var builder = [];
  this.readUtf7(input, builder);
  return builder.join("");
};
/**
 * Decodes UTF-7 bytes and pushes the resulting string pieces onto builder.
 *
 * Elements are read by position, from 0 up to input.length. That is
 * correct for typed-array views with a non-zero byteOffset (for example
 * the result of subarray) and for plain arrays alike.
 *
 * @param {Uint8Array|Array<number>} input Array-like of byte values.
 *   A negative element is treated as the end of the data.
 * @param {Array<string>} builder Receives the output via push.
 * @throws {Error} If input or builder is null or undefined.
 * @throws {TypeError} If input has no numeric length.
 */
Utf7.prototype.readUtf7 = function(input, builder) {
  if (input == null) {
    throw new Error("stream");
  }
  if (builder == null) {
    throw new Error("builder");
  }
  var endIndex = input.length;
  if (typeof endIndex != "number") {
    // Without this check an ArrayBuffer or DataView would never reach
    // the end-of-input condition.
    throw new TypeError("input must be an array-like sequence of bytes");
  }
  var REPLACEMENT = "\ufffd";
  // Emits a byte that appears outside a base64 run: tab, LF, CR and the
  // bytes 0x20-0x7d other than backslash are copied through; everything
  // else (controls, backslash, tilde, DEL, non-ASCII) becomes U+FFFD.
  function pushDirect(b) {
    var isWhitespace = b === 0x09 || b === 0x0a || b === 0x0d;
    var isPrintable = b >= 0x20 && b < 0x7e && b !== 0x5c;
    builder.push(isWhitespace || isPrintable ?
      String.fromCharCode(b) : REPLACEMENT);
  }
  var index = 0;
  var alphavalue = 0;
  var base64value = 0; // bits collected for the current 4-digit group
  var base64count = 0; // digits collected for the current group (0-3)
  var appender = new CodeUnitAppender();
  var state = 0; // 0: not in base64; 1: start of base64; 2: continuing base64
  while (true) {
    // -1 marks the end of the input in every state.
    var b = index < endIndex ? input[index++] : -1;
    switch (state) {
    case 0: // not in base64
      if (b < 0) {
        return;
      }
      if (b === 0x2b) {
        // plus sign opens a base64 run
        state = 1;
        base64value = 0;
        base64count = 0;
        appender.reset();
      } else {
        pushDirect(b);
      }
      break;
    case 1: // start of base64: the byte right after the plus sign
      if (b < 0) {
        // Input ends on a bare plus sign
        builder.push(REPLACEMENT);
        return;
      }
      if (b === 0x2d) {
        // "+-" is the escape for a literal plus sign
        state = 0;
        builder.push('+');
        break;
      }
      alphavalue = b < 0x80 ? Utf7.Alphabet[b] : -1;
      if (alphavalue >= 0) {
        state = 2;
        base64value = alphavalue;
        base64count = 1;
      } else {
        // The plus sign introduced nothing, so it is illegal; the byte
        // after it is then handled like any byte outside base64.
        state = 0;
        builder.push(REPLACEMENT);
        pushDirect(b);
      }
      break;
    case 2: // continuing base64
      alphavalue = (b < 0 || b >= 0x80) ? -1 : Utf7.Alphabet[b];
      if (alphavalue >= 0) {
        base64value = (base64value << 6) | alphavalue;
        ++base64count;
        if (base64count == 4) {
          // Four digits carry three whole bytes
          appender.appendByte((base64value >> 16) & 0xff, builder);
          appender.appendByte((base64value >> 8) & 0xff, builder);
          appender.appendByte(base64value & 0xff, builder);
          base64count = 0;
          base64value = 0;
        }
        break;
      }
      // Any other byte (or the end of input) closes the run.
      state = 0;
      if (base64count == 1) {
        // Six bits cannot form a byte
        appender.appendIncompleteByte();
      } else if (base64count == 2) {
        // Twelve bits: one byte plus four bits that must be zero
        base64value <<= 12;
        appender.appendByte((base64value >> 16) & 0xff, builder);
        if ((base64value & 0xffff) != 0) {
          appender.appendIncompleteByte();
        }
      } else if (base64count == 3) {
        // Eighteen bits: two bytes plus two bits that must be zero
        base64value <<= 6;
        appender.appendByte((base64value >> 16) & 0xff, builder);
        appender.appendByte((base64value >> 8) & 0xff, builder);
        if ((base64value & 0xff) != 0) {
          appender.appendIncompleteByte();
        }
      }
      appender.finalizeAndReset(builder);
      if (b < 0) {
        return;
      }
      if (b !== 0x2d) {
        // A hyphen only terminates the run and is dropped; any other
        // terminator is itself part of the text.
        pushDirect(b);
      }
      break;
    default:
      throw new Error("Unexpected state");
    }
  }
};
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.com/d/
*/
/**
 * Decodes input with a fresh Utf7 instance and, when the result does not
 * match expected, logs a "failure" line listing the input and the
 * expected text. Nothing is logged on success.
 */
function testUtf7One(input, expected) {
  var actual = new Utf7().decode(input);
  if (actual == expected) {
    return;
  }
  console.log("failure " + [input, expected].join(","));
}
/**
 * Runs the decoder regression cases. Each entry is an [input, expected]
 * pair handed to testUtf7One, in the order listed.
 */
function testUtf7() {
  var cases = [
    // Bytes that are illegal outside base64
    ["\\", "\ufffd"],
    ["~", "\ufffd"],
    ["\u0001", "\ufffd"],
    ["\u007f", "\ufffd"],
    // Bytes that are copied through unchanged
    ["\r\n\t '!\"#'(),$-%@[]^&=<>;*_`{}./:|?", "\r\n\t '!\"#'(),$-%@[]^&=<>;*_`{}./:|?"],
    // "+-" escape for a literal plus sign
    ["x+--", "x+-"],
    ["x+-y", "x+y"],
    // Illegal byte after plus
    ["+!", "\ufffd!"],
    ["+\n", "\ufffd\n"],
    ["+\u007f", "\ufffd\ufffd"],
    ["+", "\ufffd"],
    // Incomplete byte
    ["+D?", "\ufffd?"],
    ["+D\u007f", "\ufffd\ufffd"],
    ["+D", "\ufffd"],
    // Only one UTF-16 byte
    ["+DE?", "\ufffd?"],
    ["+DE", "\ufffd"],
    ["+DE\u007f", "\ufffd\ufffd"],
    // UTF-16 code unit
    ["+DEE?", "\u0c41?"],
    ["+DEE", "\u0c41"],
    ["+DEE\u007f", "\u0c41\ufffd"],
    // UTF-16 code unit (redundant pad bit)
    ["+DEF?", "\u0c41\ufffd?"],
    ["+DEF", "\u0c41\ufffd"],
    ["+DEF\u007f", "\u0c41\ufffd\ufffd"],
    // High surrogate code unit
    ["+2AA?", "\ufffd?"],
    ["+2AA", "\ufffd"],
    ["+2AA\u007f", "\ufffd\ufffd"],
    // Low surrogate code unit
    ["+3AA?", "\ufffd?"],
    ["+3AA", "\ufffd"],
    ["+3AA\u007f", "\ufffd\ufffd"],
    // Surrogate pair
    ["+2ADcAA?", "\ud800\udc00?"],
    ["+2ADcAA", "\ud800\udc00"],
    ["+2ADcAA\u007f", "\ud800\udc00\ufffd"],
    // High surrogate followed by surrogate pair
    ["+2ADYANwA?", "\ufffd\ud800\udc00?"],
    ["+2ADYANwA", "\ufffd\ud800\udc00"],
    ["+2ADYANwA\u007f", "\ufffd\ud800\udc00\ufffd"],
    // High surrogate followed by non-surrogate
    ["+2AAAwA?", "\ufffd\u00c0?"],
    ["+2AAAwA", "\ufffd\u00c0"],
    ["+2AAAwA\u007f", "\ufffd\u00c0\ufffd"],
    // Two UTF-16 code units
    ["+AMAA4A?", "\u00c0\u00e0?"],
    ["+AMAA4A", "\u00c0\u00e0"],
    ["+AMAA4A-Next", "\u00c0\u00e0Next"],
    ["+AMAA4A!Next", "\u00c0\u00e0!Next"],
    ["+AMAA4A\u007f", "\u00c0\u00e0\ufffd"],
    // Two UTF-16 code units (redundant pad bit)
    ["+AMAA4B?", "\u00c0\u00e0\ufffd?"],
    ["+AMAA4B", "\u00c0\u00e0\ufffd"],
    ["+AMAA4B-Next", "\u00c0\u00e0\ufffdNext"],
    ["+AMAA4B!Next", "\u00c0\u00e0\ufffd!Next"],
    ["+AMAA4B\u007f", "\u00c0\u00e0\ufffd\ufffd"]
  ];
  for (var i = 0; i < cases.length; i++) {
    testUtf7One(cases[i][0], cases[i][1]);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment