Last active
August 29, 2015 14:02
-
-
Save peteroupc/08c5ecc8131a76062ffe to your computer and use it in GitHub Desktop.
UTF-7 decoder in JavaScript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.com/d/
*/
/**
 * Assembles UTF-16 code units from individual bytes (high-order byte first)
 * and appends the resulting characters to a caller-supplied array of string
 * pieces. Unpaired surrogates and incomplete code units are reported as
 * CodeUnitAppender.replacement (U+FFFD) instead of being emitted as is.
 */
var CodeUnitAppender = function() {
  // Pending high surrogate that still needs a low surrogate, or -1 if none.
  this.surrogate = -1;
  // Pending first byte of a two-byte code unit, or -1 if none.
  this.lastByte = -1;
};
(function(prototype, constr) {
  // String pushed in place of every malformed unit.
  constr.replacement = "\ufffd";

  // True if the code unit lies in the high (lead) surrogate range D800-DBFF.
  function isHighSurrogate(codeunit) {
    return (codeunit & 0xfc00) === 0xd800;
  }

  // True if the code unit lies in the low (trail) surrogate range DC00-DFFF.
  function isLowSurrogate(codeunit) {
    return (codeunit & 0xfc00) === 0xdc00;
  }

  /**
   * Flushes any leftover state: one replacement string is pushed for a
   * pending high surrogate and one for a pending (or incomplete) byte.
   * The appender is then returned to its initial state.
   * @param {Array} builder Array of string pieces to append to.
   */
  prototype.finalizeAndReset = function(builder) {
    if (this.surrogate >= 0) {
      // Unpaired surrogate remains
      builder.push(constr.replacement);
    }
    if (this.lastByte >= 0) {
      // Unpaired byte value remains
      builder.push(constr.replacement);
    }
    this.reset();
  };

  /**
   * Records that a partial byte was seen, so that finalizeAndReset
   * emits a replacement string for it.
   */
  prototype.appendIncompleteByte = function() {
    // Any value other than -1 marks "a byte is pending".
    this.lastByte = 0;
  };

  /**
   * Feeds one byte. The first byte of each pair is held back; the second
   * completes a code unit (first byte is the high-order byte), which is
   * passed on to appendCodeUnit.
   * @param {number} value Byte value; only the low 8 bits of the second
   *   byte of a pair are used.
   * @param {Array} builder Array of string pieces to append to.
   */
  prototype.appendByte = function(value, builder) {
    if (this.lastByte < 0) {
      this.lastByte = value;
      return;
    }
    var codeunit = (this.lastByte << 8) | (value & 0xff);
    this.appendCodeUnit(codeunit, builder);
    this.lastByte = -1;
  };

  /**
   * Feeds one UTF-16 code unit. Non-surrogates are pushed directly, a high
   * surrogate is held until the next code unit arrives, and a complete
   * surrogate pair is pushed as two one-character strings. Any surrogate
   * that cannot be paired is replaced with the replacement string.
   * @param {number} codeunit UTF-16 code unit.
   * @param {Array} builder Array of string pieces to append to.
   */
  prototype.appendCodeUnit = function(codeunit, builder) {
    var pending = this.surrogate;
    if (isHighSurrogate(codeunit)) {
      if (pending >= 0) {
        // The earlier high surrogate was never completed
        builder.push(constr.replacement);
      }
      // Hold this high surrogate until its low surrogate arrives
      this.surrogate = codeunit;
      return;
    }
    this.surrogate = -1;
    if (isLowSurrogate(codeunit)) {
      if (pending >= 0) {
        // Valid surrogate pair
        builder.push(String.fromCharCode(pending));
        builder.push(String.fromCharCode(codeunit));
      } else {
        // Unpaired low surrogate
        builder.push(constr.replacement);
      }
      return;
    }
    if (pending >= 0) {
      // Not a surrogate: the pending high surrogate becomes U+FFFD
      // and the new code unit is output as is
      builder.push(constr.replacement);
    }
    builder.push(String.fromCharCode(codeunit));
  };

  /**
   * Discards any pending surrogate or byte without emitting anything.
   */
  prototype.reset = function() {
    this.surrogate = -1;
    this.lastByte = -1;
  };
})(CodeUnitAppender.prototype, CodeUnitAppender);
/**
 * UTF-7 decoder. Bytes outside a "+...-" base64 run are copied through when
 * they are tab, LF, CR or printable ASCII other than backslash and tilde;
 * every other byte becomes U+FFFD. Base64 runs are decoded into UTF-16 code
 * units (high-order byte first) with the help of CodeUnitAppender.
 */
var Utf7 = function() {
};
(function(prototype, constr) {
  var REPLACEMENT = "\ufffd";
  var STATE_DIRECT = 0; // not in base64
  var STATE_BASE64_START = 1; // just after the plus sign
  var STATE_BASE64 = 2; // continuing base64

  // Maps an ASCII byte (0-127) to its base64 digit value, or -1 if the
  // byte is not a base64 digit. Padding ("=") is not part of the alphabet.
  constr.Alphabet = (function() {
    var digits =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    var table = [];
    var i;
    for (i = 0; i < 128; i++) {
      table[i] = -1;
    }
    for (i = 0; i < digits.length; i++) {
      table[digits.charCodeAt(i)] = i;
    }
    return table;
  })();

  // Normalizes the supported input kinds to an indexable sequence of bytes.
  function toByteSequence(input) {
    if (typeof input === "string") {
      var bytes = new Uint8Array(input.length);
      for (var i = 0; i < input.length; i++) {
        // treat char codes beyond 0xFF as 0xFF; they will
        // be considered invalid for the purposes of UTF-7 decoding
        bytes[i] = Math.min(0xff, input.charCodeAt(i));
      }
      return bytes;
    }
    if (input instanceof ArrayBuffer) {
      return new Uint8Array(input);
    }
    if (typeof input.length !== "number") {
      // Not indexable; iterating it could never reach an end
      throw new Error("stream");
    }
    return input;
  }

  // Appends a byte found outside base64: itself if allowed, else U+FFFD.
  function appendDirectByte(b, builder) {
    var isAllowed = b === 0x09 || b === 0x0a || b === 0x0d ||
      (b >= 0x20 && b < 0x7e && b !== 0x5c);
    builder.push(isAllowed ? String.fromCharCode(b) : REPLACEMENT);
  }

  /**
   * Decodes UTF-7 data to a string.
   * @param {string|Uint8Array|ArrayBuffer|Array} input Bytes to decode. In a
   *   string, each char code is taken as one byte (codes above 0xFF are
   *   invalid). Array elements are expected to be byte values.
   * @returns {string} The decoded text, with U+FFFD for malformed input.
   * @throws {Error} If input is null or undefined, or is not indexable.
   */
  prototype.decode = function(input) {
    var builder = [];
    this.readUtf7(input, builder);
    return builder.join("");
  };

  /**
   * Decodes UTF-7 data and pushes the decoded pieces onto builder.
   * @param {string|Uint8Array|ArrayBuffer|Array} input Bytes to decode
   *   (same kinds as accepted by decode).
   * @param {Array} builder Array that receives the decoded string pieces.
   * @throws {Error} "stream" if input is null/undefined or not indexable;
   *   "builder" if builder is null/undefined.
   */
  prototype.readUtf7 = function(input, builder) {
    if (input == null) {
      throw new Error("stream");
    }
    if (builder == null) {
      throw new Error("builder");
    }
    var bytes = toByteSequence(input);
    // Element indices are relative to the view itself, so iteration runs
    // from 0 to length regardless of the view's byteOffset in its buffer.
    var index = 0;
    var endIndex = bytes.length;
    var base64value = 0; // bits collected for the current base64 group
    var base64count = 0; // base64 digits collected for the current group
    var appender = new CodeUnitAppender();
    var state = STATE_DIRECT;
    while (true) {
      var atEnd = index >= endIndex;
      var b = atEnd ? -1 : bytes[index++];
      var alphavalue = (!atEnd && b >= 0 && b < 0x80) ? constr.Alphabet[b] : -1;
      switch (state) {
        case STATE_DIRECT:
          if (atEnd) {
            // done
            return;
          }
          if (b === 0x2b) {
            // plus sign: a base64 run may follow
            state = STATE_BASE64_START;
            base64value = 0;
            base64count = 0;
            appender.reset();
          } else {
            appendDirectByte(b, builder);
          }
          break;
        case STATE_BASE64_START:
          if (atEnd) {
            // End of stream right after the plus sign, illegal
            builder.push(REPLACEMENT);
            return;
          }
          if (b === 0x2d) {
            // "+-" encodes a literal plus sign
            state = STATE_DIRECT;
            builder.push("+");
          } else if (alphavalue >= 0) {
            state = STATE_BASE64;
            base64value = alphavalue;
            base64count = 1;
          } else {
            // Not a base64 digit: one U+FFFD for the illegal plus, then
            // the byte itself is treated as a byte outside base64
            state = STATE_DIRECT;
            builder.push(REPLACEMENT);
            appendDirectByte(b, builder);
          }
          break;
        case STATE_BASE64:
          if (alphavalue >= 0) {
            base64value = (base64value << 6) | alphavalue;
            ++base64count;
            if (base64count === 4) {
              // Four digits carry three UTF-16 bytes
              appender.appendByte((base64value >> 16) & 0xff, builder);
              appender.appendByte((base64value >> 8) & 0xff, builder);
              appender.appendByte(base64value & 0xff, builder);
              base64value = 0;
              base64count = 0;
            }
            break;
          }
          // The run ends here: flush whatever the partial group holds
          state = STATE_DIRECT;
          if (base64count === 1) {
            // 6 bits only: incomplete base64 byte
            appender.appendIncompleteByte();
          } else if (base64count === 2) {
            // 12 bits: one byte plus 4 pad bits
            base64value <<= 12;
            appender.appendByte((base64value >> 16) & 0xff, builder);
            if ((base64value & 0xffff) !== 0) {
              // Nonzero pad bits
              appender.appendIncompleteByte();
            }
          } else if (base64count === 3) {
            // 18 bits: two bytes plus 2 pad bits
            base64value <<= 6;
            appender.appendByte((base64value >> 16) & 0xff, builder);
            appender.appendByte((base64value >> 8) & 0xff, builder);
            if ((base64value & 0xff) !== 0) {
              // Nonzero pad bits
              appender.appendIncompleteByte();
            }
          }
          appender.finalizeAndReset(builder);
          if (atEnd) {
            // End of stream
            return;
          }
          if (b !== 0x2d) {
            // A hyphen that closes the run is swallowed; any other
            // terminating byte is also part of the output
            appendDirectByte(b, builder);
          }
          break;
        default:
          throw new Error("Unexpected state");
      }
    }
  };
})(Utf7.prototype, Utf7);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Written by Peter O. in 2014.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.com/d/
*/
/**
 * Decodes input with a fresh Utf7 instance and logs a "failure" line to the
 * console when the result is not strictly equal to the expected string.
 * @param {*} input Value passed to Utf7.prototype.decode.
 * @param {string} expected Expected decoded text.
 */
function testUtf7One(input, expected) {
  var actual = new Utf7().decode(input);
  // Strict comparison: an expectation of the wrong type must not pass
  // through implicit conversion.
  if (actual !== expected) {
    // JSON.stringify keeps control characters visible in the log and the
    // third element shows what the decoder actually produced.
    console.log("failure " + JSON.stringify([input, expected, actual]));
  }
}
/**
 * Runs the UTF-7 decoder cases through testUtf7One, in order. Each entry
 * is an [input, expected] pair; failures are reported by testUtf7One.
 */
function testUtf7() {
  var cases = [
    // Bytes outside base64: illegal bytes, allowed bytes, and "+-"
    ["\\", "\ufffd"],
    ["~", "\ufffd"],
    ["\u0001", "\ufffd"],
    ["\u007f", "\ufffd"],
    ["\r\n\t '!\"#'(),$-%@[]^&=<>;*_`{}./:|?", "\r\n\t '!\"#'(),$-%@[]^&=<>;*_`{}./:|?"],
    ["x+--", "x+-"],
    ["x+-y", "x+y"],
    // Illegal byte after plus
    ["+!", "\ufffd!"],
    ["+\n", "\ufffd\n"],
    ["+\u007f", "\ufffd\ufffd"],
    ["+", "\ufffd"],
    // Incomplete byte
    ["+D?", "\ufffd?"],
    ["+D\u007f", "\ufffd\ufffd"],
    ["+D", "\ufffd"],
    // Only one UTF-16 byte
    ["+DE?", "\ufffd?"],
    ["+DE", "\ufffd"],
    ["+DE\u007f", "\ufffd\ufffd"],
    // UTF-16 code unit
    ["+DEE?", "\u0c41?"],
    ["+DEE", "\u0c41"],
    ["+DEE\u007f", "\u0c41\ufffd"],
    // UTF-16 code unit (redundant pad bit)
    ["+DEF?", "\u0c41\ufffd?"],
    ["+DEF", "\u0c41\ufffd"],
    ["+DEF\u007f", "\u0c41\ufffd\ufffd"],
    // High surrogate code unit
    ["+2AA?", "\ufffd?"],
    ["+2AA", "\ufffd"],
    ["+2AA\u007f", "\ufffd\ufffd"],
    // Low surrogate code unit
    ["+3AA?", "\ufffd?"],
    ["+3AA", "\ufffd"],
    ["+3AA\u007f", "\ufffd\ufffd"],
    // Surrogate pair
    ["+2ADcAA?", "\ud800\udc00?"],
    ["+2ADcAA", "\ud800\udc00"],
    ["+2ADcAA\u007f", "\ud800\udc00\ufffd"],
    // High surrogate followed by surrogate pair
    ["+2ADYANwA?", "\ufffd\ud800\udc00?"],
    ["+2ADYANwA", "\ufffd\ud800\udc00"],
    ["+2ADYANwA\u007f", "\ufffd\ud800\udc00\ufffd"],
    // High surrogate followed by non-surrogate
    ["+2AAAwA?", "\ufffd\u00c0?"],
    ["+2AAAwA", "\ufffd\u00c0"],
    ["+2AAAwA\u007f", "\ufffd\u00c0\ufffd"],
    // Two UTF-16 code units
    ["+AMAA4A?", "\u00c0\u00e0?"],
    ["+AMAA4A", "\u00c0\u00e0"],
    ["+AMAA4A-Next", "\u00c0\u00e0Next"],
    ["+AMAA4A!Next", "\u00c0\u00e0!Next"],
    ["+AMAA4A\u007f", "\u00c0\u00e0\ufffd"],
    // Two UTF-16 code units (redundant pad bit)
    ["+AMAA4B?", "\u00c0\u00e0\ufffd?"],
    ["+AMAA4B", "\u00c0\u00e0\ufffd"],
    ["+AMAA4B-Next", "\u00c0\u00e0\ufffdNext"],
    ["+AMAA4B!Next", "\u00c0\u00e0\ufffd!Next"],
    ["+AMAA4B\u007f", "\u00c0\u00e0\ufffd\ufffd"]
  ];
  for (var i = 0; i < cases.length; i++) {
    testUtf7One(cases[i][0], cases[i][1]);
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment