-
-
Save creationix/5992451 to your computer and use it in GitHub Desktop.
// A streaming byte oriented JSON parser. Feed it a single byte at a time and
// it will emit complete values as it comes across them. Whitespace within and
// between values is ignored. This means it can parse newline delimited JSON.
//
// emit(value) - called with every completed value.
// next        - optional state function. It becomes the current state after a
//               value has been emitted, and it also receives any byte that
//               cannot start a value. Defaults to the value state itself, in
//               which case a byte that cannot start a value throws.
//
// Returns the initial state function. Every state takes one byte and returns
// the state to use for the following byte.
function jsonMachine(emit, next) {
  next = next || $value;
  return $value;

  function $value(byte) {
    // A falsy byte (e.g. calling the state with no argument) ends parsing.
    if (!byte) return;
    if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
      return $value; // Ignore whitespace
    }
    if (byte === 0x22) { // "
      return stringMachine(onValue);
    }
    // Only 0x30-0x39 are digits; 0x3a-0x3f are : ; < = > ? and must not
    // start a number.
    if (byte === 0x2d || (byte >= 0x30 && byte <= 0x39)) { // - or 0-9
      return numberMachine(byte, onNumber);
    }
    if (byte === 0x7b) { // {
      return objectMachine(onValue);
    }
    if (byte === 0x5b) { // [
      return arrayMachine(onValue);
    }
    if (byte === 0x74) { // t
      return constantMachine(TRUE, true, onValue);
    }
    if (byte === 0x66) { // f
      return constantMachine(FALSE, false, onValue);
    }
    if (byte === 0x6e) { // n
      return constantMachine(NULL, null, onValue);
    }
    if (next === $value) {
      throw new Error("Unexpected 0x" + byte.toString(16));
    }
    return next(byte);
  }

  // Self-delimited values (strings, containers, constants) end on their own
  // last byte, so the following byte simply goes to `next`.
  function onValue(value) {
    emit(value);
    return next;
  }

  // A number is only known to be complete once the first byte after it has
  // been seen, so that byte arrives together with the value. It belongs to
  // whatever follows the value, i.e. `next` - handing it back to $value would
  // let a second value follow without a separator inside containers.
  function onNumber(number, byte) {
    emit(number);
    if (!byte) return; // end of input right after the number
    return next(byte);
  }
}
// Tails of the three JSON literals. The first letter (t, f or n) is consumed
// by the value state before constantMachine is started, so only the remaining
// bytes are listed here.
var TRUE = [0x72, 0x75, 0x65]; // "rue"
var FALSE = [0x61, 0x6c, 0x73, 0x65]; // "alse"
var NULL = [0x75, 0x6c, 0x6c]; // "ull"
// State machine that matches a fixed sequence of bytes.
//
// bytes - array of byte values that must arrive in exactly this order.
// value - value handed to emit once the whole sequence has matched.
// emit  - callback; its return value becomes the next state.
//
// Throws an Error on the first byte that does not match, or when the input
// ends (byte is undefined/null) before the sequence is complete.
function constantMachine(bytes, value, emit) {
  var index = 0;
  var length = bytes.length;
  return $constant;

  function $constant(byte) {
    if (byte !== bytes[index]) {
      if (byte == null) {
        // Avoid calling .toString on a missing byte, which would surface as
        // an unrelated TypeError.
        throw new Error("Unexpected end of input in constant");
      }
      throw new Error("Unexpected 0x" + byte.toString(16));
    }
    index++;
    if (index < length) return $constant;
    return emit(value);
  }
}
// State machine for the body of a JSON string. It is started after the
// opening quote has been consumed and calls emit(string) when the closing
// quote arrives; emit's return value becomes the next state.
//
// Throws on raw ASCII control characters and on unknown escape sequences.
function stringMachine(emit) {
  var string = "";
  return $string;

  function $string(byte) {
    if (byte === 0x22) { // "
      return emit(string);
    }
    if (byte === 0x5c) { // \
      return $escapedString;
    }
    if (byte & 0x80) { // UTF-8 handling
      return utf8Machine(byte, onCharCode);
    }
    if (byte < 0x20) { // ASCII control character
      throw new Error("Unexpected control character: 0x" + byte.toString(16));
    }
    string += String.fromCharCode(byte);
    return $string;
  }

  // Handles the byte that follows a backslash.
  function $escapedString(byte) {
    if (byte === 0x22 || byte === 0x5c || byte === 0x2f) { // " \ /
      string += String.fromCharCode(byte);
      return $string;
    }
    if (byte === 0x62) { // b
      string += "\b";
      return $string;
    }
    if (byte === 0x66) { // f
      string += "\f";
      return $string;
    }
    if (byte === 0x6e) { // n
      string += "\n";
      return $string;
    }
    if (byte === 0x72) { // r
      string += "\r";
      return $string;
    }
    if (byte === 0x74) { // t
      string += "\t";
      return $string;
    }
    if (byte === 0x75) { // u
      return hexMachine(onCharCode);
    }
    // Anything else is not a valid escape. Failing here gives a meaningful
    // error instead of returning undefined as the next state.
    if (byte == null) {
      throw new Error("Unexpected end of input in string escape");
    }
    throw new Error("Invalid string escape character: 0x" + byte.toString(16));
  }

  // Appends a decoded code point and resumes normal string parsing.
  function onCharCode(charCode) {
    if (charCode > 0xffff) {
      // String.fromCharCode keeps only the low 16 bits, so code points
      // outside the BMP have to be written as a surrogate pair.
      var offset = charCode - 0x10000;
      string += String.fromCharCode(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff));
    } else {
      string += String.fromCharCode(charCode);
    }
    return $string;
  }
}
// Nestable state machine for UTF-8 decoding.
//
// byte - the lead byte of a multi-byte sequence (high bit set).
// emit - called with the decoded code point once all continuation bytes have
//        arrived; its return value becomes the next state.
//
// Throws on an invalid lead byte, an invalid or missing continuation byte, an
// overlong encoding, or a code point above 0x10FFFF.
function utf8Machine(byte, emit) {
  var left = 0; // continuation bytes still expected
  var num = 0; // code point assembled so far
  var min = 0; // smallest code point that legitimately needs this length
  if (byte >= 0xc0 && byte < 0xe0) { // 2-byte UTF-8 Character
    left = 1;
    num = (byte & 0x1f) << 6;
    min = 0x80;
    return $utf8;
  }
  if (byte >= 0xe0 && byte < 0xf0) { // 3-byte UTF-8 Character
    left = 2;
    num = (byte & 0xf) << 12;
    min = 0x800;
    return $utf8;
  }
  if (byte >= 0xf0 && byte < 0xf8) { // 4-byte UTF-8 Character
    left = 3;
    num = (byte & 0x07) << 18;
    min = 0x10000;
    return $utf8;
  }
  throw new Error("Invalid byte in UTF-8 string: 0x" + byte.toString(16));

  function $utf8(byte) {
    if (byte == null) {
      throw new Error("Unexpected end of input in UTF-8 character");
    }
    if ((byte & 0xc0) !== 0x80) {
      throw new Error("Invalid byte in UTF-8 character: 0x" + byte.toString(16));
    }
    // Each continuation byte contributes 6 bits, most significant first.
    num |= (byte & 0x3f) << (--left * 6);
    if (left) return $utf8;
    if (num < min) {
      // The same code point has a shorter encoding; a conforming decoder
      // must not accept the longer form.
      throw new Error("Overlong UTF-8 encoding of 0x" + num.toString(16));
    }
    if (num > 0x10ffff) {
      throw new Error("UTF-8 code point out of range: 0x" + num.toString(16));
    }
    return emit(num);
  }
}
// Nestable state machine for hex escaped characters (the four hex digits
// that follow \u inside a string).
//
// emit - called with the 16-bit value once four digits have been read; its
//        return value becomes the next state.
//
// Throws when a byte is not one of 0-9, a-f or A-F.
function hexMachine(emit) {
  var left = 4; // hex digits still expected
  var num = 0; // value assembled so far
  return $hex;

  function $hex(byte) {
    var digit;
    // The decimal range ends at 0x39; 0x3a-0x3f are punctuation.
    if (byte >= 0x30 && byte <= 0x39) digit = byte - 0x30; // 0-9
    else if (byte >= 0x61 && byte <= 0x66) digit = byte - 0x57; // a-f
    else if (byte >= 0x41 && byte <= 0x46) digit = byte - 0x37; // A-F
    else throw new Error("Expected hex char in string hex escape");
    // Digits arrive most significant first, 4 bits each.
    num |= digit << (--left * 4);
    if (left) return $hex;
    return emit(num);
  }
}
// State machine for JSON numbers.
//
// byte - the first byte of the number ("-" or a digit).
// emit - called as emit(value, byte) where `byte` is the first byte that is
//        NOT part of the number (a number has no terminator of its own, so
//        one byte of look-ahead is always consumed). Its return value becomes
//        the next state.
//
// The accepted characters are collected as text and converted with Number()
// at the end. This keeps fraction digits in the order they were read and
// yields the correctly rounded double.
//
// Throws when a digit is required but something else (or the end of input)
// arrives: after a leading "-", after ".", and after "e"/"E" with its
// optional sign.
function numberMachine(byte, emit) {
  var text = "";
  if (byte === 0x2d) { // -
    text = "-";
    return $start;
  }
  return $start(byte);

  function isDigit(b) {
    return b >= 0x30 && b <= 0x39;
  }

  // Records an accepted byte and moves on to `state`.
  function take(b, state) {
    text += String.fromCharCode(b);
    return state;
  }

  function invalid(b) {
    if (b == null) {
      return new Error("Invalid number: unexpected end of input");
    }
    return new Error("Invalid number: 0x" + b.toString(16));
  }

  // First digit of the integer part. A leading 0 cannot be followed by
  // further integer digits.
  function $start(byte) {
    if (byte === 0x30) {
      return take(byte, $mid);
    }
    if (isDigit(byte)) {
      return take(byte, $number);
    }
    throw invalid(byte);
  }

  // After the integer part: optional fraction.
  function $mid(byte) {
    if (byte === 0x2e) { // .
      return take(byte, $fraction);
    }
    return $later(byte);
  }

  function $number(byte) {
    if (isDigit(byte)) {
      return take(byte, $number);
    }
    return $mid(byte);
  }

  // First digit after the decimal point is mandatory.
  function $fraction(byte) {
    if (isDigit(byte)) {
      return take(byte, $decimal);
    }
    throw invalid(byte);
  }

  function $decimal(byte) {
    if (isDigit(byte)) {
      return take(byte, $decimal);
    }
    return $later(byte);
  }

  // After integer and fraction: optional exponent.
  function $later(byte) {
    if (byte === 0x45 || byte === 0x65) { // E e
      return take(byte, $esign);
    }
    return $done(byte);
  }

  function $esign(byte) {
    if (byte === 0x2b || byte === 0x2d) { // + -
      return take(byte, $exponentStart);
    }
    return $exponentStart(byte);
  }

  // First exponent digit is mandatory.
  function $exponentStart(byte) {
    if (isDigit(byte)) {
      return take(byte, $exponent);
    }
    throw invalid(byte);
  }

  function $exponent(byte) {
    if (isDigit(byte)) {
      return take(byte, $exponent);
    }
    return $done(byte);
  }

  function $done(byte) {
    return emit(Number(text), byte);
  }
}
// State machine for the body of a JSON array. It is started after the
// opening "[" has been consumed and calls emit(array) when the matching "]"
// arrives; emit's return value becomes the next state.
//
// Element values are parsed by nested jsonMachine instances which report
// them through onValue and hand every byte they cannot use to $comma.
//
// Throws on leading, trailing or doubled commas, on values that are not
// separated by a comma, and on any other unexpected byte.
function arrayMachine(emit) {
  var array = [];
  // True while the most recent element has not yet been followed by a comma.
  var haveValue = false;
  return $array;

  function $array(byte) {
    if (byte === 0x5d) { // ]
      return emit(array);
    }
    return jsonMachine(onValue, $comma)(byte);
  }

  function onValue(value) {
    if (haveValue) {
      throw new Error("Missing , between array values");
    }
    array.push(value);
    haveValue = true;
  }

  function $comma(byte) {
    if (byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20) {
      return $comma; // Ignore whitespace
    }
    // A comma is only legal directly after an element.
    if (byte === 0x2c && haveValue) { // ,
      haveValue = false;
      return jsonMachine(onValue, $comma);
    }
    // "]" is legal after an element, or when nothing at all has been read
    // (an empty array that contained only whitespace).
    if (byte === 0x5d && (haveValue || array.length === 0)) { // ]
      return emit(array);
    }
    if (byte == null) {
      throw new Error("Unexpected end of input in array body");
    }
    throw new Error("Unexpected byte: 0x" + byte.toString(16) + " in array body");
  }
}
// State machine for the body of a JSON object. It is started after the
// opening "{" has been consumed and calls emit(object) when the matching "}"
// arrives; emit's return value becomes the next state.
//
// Keys are parsed with stringMachine, member values with nested jsonMachine
// instances that hand every byte they cannot use to $comma.
//
// Throws on missing values, trailing commas, values that are not separated
// by a comma, and on any other unexpected byte.
function objectMachine(emit) {
  var object = {};
  var key;
  // True once the value for the current key has been stored.
  var haveValue = false;
  return $object;

  function isWhitespace(byte) {
    return byte === 0x09 || byte === 0x0a || byte === 0x0d || byte === 0x20;
  }

  function unexpected(byte) {
    if (byte == null) {
      return new Error("Unexpected end of input in object body");
    }
    return new Error("Unexpected byte: 0x" + byte.toString(16));
  }

  // Directly after "{": either the object is empty or the first key starts.
  function $object(byte) {
    if (isWhitespace(byte)) {
      return $object; // Ignore whitespace
    }
    if (byte === 0x7d) { // }
      return emit(object);
    }
    return $key(byte);
  }

  // Start of a key. Whitespace stays in this state so that "}" is not
  // accepted after a comma.
  function $key(byte) {
    if (isWhitespace(byte)) {
      return $key; // Ignore whitespace
    }
    if (byte === 0x22) {
      return stringMachine(onKey);
    }
    throw unexpected(byte);
  }

  function onKey(result) {
    key = result;
    haveValue = false;
    return $colon;
  }

  function $colon(byte) {
    if (isWhitespace(byte)) {
      return $colon; // Ignore whitespace
    }
    if (byte === 0x3a) { // :
      return jsonMachine(onValue, $comma);
    }
    throw unexpected(byte);
  }

  function onValue(value) {
    if (haveValue) {
      throw new Error("Missing , between object members");
    }
    // defineProperty always creates an own property. Plain assignment would
    // go through the inherited __proto__ setter for the key "__proto__" and
    // change the object's prototype instead of storing the member.
    Object.defineProperty(object, key, {
      value: value,
      writable: true,
      enumerable: true,
      configurable: true
    });
    haveValue = true;
  }

  // After a member value: "," continues with the next key, "}" finishes.
  // Both are only legal once a value has actually been stored.
  function $comma(byte) {
    if (isWhitespace(byte)) {
      return $comma; // Ignore whitespace
    }
    if (haveValue) {
      if (byte === 0x2c) { // ,
        return $key;
      }
      if (byte === 0x7d) { // }
        return emit(object);
      }
    }
    throw unexpected(byte);
  }
}
// Demo driver: runs a few sample documents through the parser one byte at a
// time and prints every value the parser emits.
var inspect = require("util").inspect;
var inputs = [
  '"this is a \\u5ee9 string" "so is this €"\r\n"How about ¢?"\t"詩檧窣廩 禨碜婨, 珦覵 氨焨鋨"',
  '["a",1,[1,2,3]]',
  '12345 6789',
  '{"name":"Tim Caswell","age":31,"true":true,"false":false,"null":null}',
  '-1 -1.1 -0.3 3.14e-3 10E5'
];
inputs.forEach(function (input) {
  // Buffer.from replaces the deprecated `new Buffer(string)` constructor;
  // like it, it encodes the string as UTF-8 by default.
  var data = Buffer.from(input);
  var state = jsonMachine(emit);
  for (var i = 0, l = data.length; i < l; i++) {
    state = state(data[i]);
  }
  // Call once more without a byte to signal the end of input. This flushes
  // a trailing number, which is only emitted once the byte after it is seen.
  state();
});
function emit(value) {
  console.log(inspect(value, {colors:true}));
}
I can't follow this code. I come from a C# background. Can you explain the functions that are named with a $? I don't see where they are ever called.
I had the same problem digesting the parser in your 'chrome-app-module-loader'.
I'm currently writing some non-trivial chrome apps, and I really like the idea of your loader that allows modules to be developed with commonJS syntax.
me neither dude
The `$` in the name doesn't mean anything. It could be a `Z` or an `_` and the language doesn't care.
I think what's confusing to many is that this code returns functions as values in a lot of places and then calls those returned function values later. You'll need a good understanding of first-class functions to understand this code. It's common in JavaScript (which takes a lot of its design from Scheme), but not so much in languages like C# or Java.
To hook this up to a node stream with `data` and `end` events, you can do the following: