Last active
February 21, 2019 05:43
-
-
Save mikesamuel/20710f94a53e440691f04bf79bc3d756 to your computer and use it in GitHub Desktop.
A canonicalizing function to make it easy to hash JSON
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"use strict"; | |
// Prompted by https://esdiscuss.org/topic/json-canonicalize | |
// Given a string of JSON produces a string of JSON without unnecessary | |
// degrees of freedom like whitespace, optional escape sequences, and | |
// unnecessary variance in number representation. | |
function hashable(json) { | |
const strs = [] // Side table to collect string bodies | |
return reorderProperties( | |
strs, | |
json | |
// Store string content in a side-table. | |
// This makes the content is still valid JSON but without | |
// any strings that contain whitespace, commas, colons or brackets. | |
.replace(/"(?:[^\\"]|\\.)*"/g, (x) => { strs.push(x); return `"@${strs.length-1}"` }) | |
// Eliminate all whitespace. | |
.replace(/[\x00-\x20]+/g, '') | |
// Canonicalize all numbers. | |
.replace(/([+\-]?)(?=[\d.])(\d*)(?:[.](\d*))?(?:[eE]([+\-]?\d+))?(?=[,\]\}]|$)/g, canonNumber)) | |
// Restore the numbers from the side table | |
.replace(/"@(\d+)"/g, canonStr.bind(null, strs)) | |
} | |
// RegExp replacer function that canonicalizes a number even if it | |
// is not precisely representable in JavaScript. | |
function canonNumber(_, sign, integer, fraction, mantissa) { | |
if (sign !== '-') { sign = '' } | |
integer = integer || '0' | |
fraction = fraction || '' | |
mantissa = (+mantissa || 0) + integer.length - 1 | |
let digits = (integer + fraction) | |
// Strip zeroes from front | |
.replace(/^(?:0(?!$))+/, s => (mantissa -= s.length, '')) || '0' | |
if (digits === '0') { mantissa = 0 } | |
// Exponential notation. | |
if (-26 >= mantissa || mantissa >= 26) { | |
const afterdot = digits.substring(1).replace(/0+$/, '') | |
return sign + digits[0] + (afterdot ? `.${afterdot}` : '') + 'e' + mantissa | |
} | |
return (mantissa < 0) | |
// Pad zeroes left | |
// digits=123456, mantissa=-1 -> 0.123456 | |
? sign + '0.' + '0'.repeat(-mantissa - 1) + digits | |
: (mantissa + 1 < digits.length) | |
// Dot in the middle | |
? sign + digits.substring(0, mantissa + 1) + '.' + digits.substring(mantissa + 1) | |
// Pad zeroes right | |
// digits=123456, mantissa=6 -> 1234560 | |
: sign + digits + '0'.repeat(mantissa - digits.length + 1) | |
} | |
// Pulls a string out of a side table. "@1" -> strs[1] but with escape sequences normalized. | |
const canonStr = (strs, s) => JSON.stringify(JSON.parse(strs[s.substring(2, s.length - 1)])) | |
// Recursively reorder properties on JSON {}-style records | |
function reorderProperties(strs, s) { | |
// Pull out tokens and their indices as cues to structure | |
const tokens = [] | |
const tokenPattern = /[{}\[\],]/g | |
for (let match; (match = tokenPattern.exec(s));) { | |
tokens.push(match) // Match has form ['{'] but with index propert. | |
} | |
if (!tokens.length) { return s } | |
const memoTable = [] | |
function canonKey(strIndex) { | |
if (memoTable[strIndex] === undefined) { | |
memoTable[strIndex] = JSON.parse(strs[strIndex]) | |
} | |
return memoTable[strIndex] | |
} | |
// Walk the structure and rebuild the output but ordered. | |
return reorder(0, tokens.length - 1) | |
function reorder(left, right) { | |
const { "0": token, index } = tokens[left] | |
let rangeStart = left, valueStart = null | |
let depth, cursor | |
let ranges = [] | |
for (depth = 1, cursor = left + 1; cursor < right; ++cursor) { | |
switch (tokens[cursor][0]) { | |
case '{': case '[': | |
if (depth === 1) { valueStart = cursor } | |
++depth | |
break | |
case ']': case '}': --depth; break | |
case ',': | |
if (depth === 1) { | |
ranges.push([rangeStart, valueStart, cursor]) | |
rangeStart = cursor | |
valueStart = null | |
} | |
} | |
} | |
ranges.push([rangeStart, valueStart, right]) | |
ranges = ranges.map( | |
([start, valueStart, end]) => (valueStart === null | |
// No nested object or array | |
? s.substring(tokens[start].index + 1, tokens[end].index) | |
// Recurse to nested object or array | |
: s.substring(tokens[start].index + 1, tokens[valueStart].index) + reorder(valueStart, end - 1))) | |
if (token === '{') { | |
ranges.sort((a, b) => { | |
const aKey = canonKey(/^"@(\d+)":/.exec(a)[1]) | |
const bKey = canonKey(/^"@(\d+)":/.exec(b)[1]) | |
return aKey < bKey ? -1 : aKey === bKey ? 0 : 1 | |
}) | |
} | |
return token + ranges.join(',') + tokens[right][0] | |
} | |
} | |
if (typeof module !== 'undefined') { module.exports = { hashable } } | |
// Tests for canonNumber | |
void ([ | |
{ input: ['', '123', '456', '0'], want: '123.456' }, | |
{ input: ['+', '123', '456', '0'], want: '123.456' }, | |
{ input: ['-', '123', '456', '0'], want: '-123.456' }, | |
{ input: ['', '123', '456', '30'], want: '1.23456e32' }, | |
{ input: ['', '123', '456', '-30'], want: '1.23456e-28' }, | |
{ input: ['-', '123', '456', '-30'], want: '-1.23456e-28' }, | |
{ input: ['', '123', '456', '-7'], want: '0.0000123456' }, | |
{ input: ['', '123', '456', '-6'], want: '0.000123456' }, | |
{ input: ['', '123', '456', '-5'], want: '0.00123456' }, | |
{ input: ['', '123', '456', '-4'], want: '0.0123456' }, | |
{ input: ['', '123', '456', '-3'], want: '0.123456' }, | |
{ input: ['', '123', '456', '-2'], want: '1.23456' }, | |
{ input: ['', '123', '456', '-1'], want: '12.3456' }, | |
{ input: ['', '123', '456', '-0'], want: '123.456' }, | |
{ input: ['', '123', '456', '0'], want: '123.456' }, | |
{ input: ['', '123', '456', '1'], want: '1234.56' }, | |
{ input: ['', '123', '456', '2'], want: '12345.6' }, | |
{ input: ['', '123', '456', '3'], want: '123456' }, | |
{ input: ['', '123', '456', '4'], want: '1234560' }, | |
{ input: ['', '123', '456', '5'], want: '12345600' }, | |
{ input: ['', '123', '456', '6'], want: '123456000' }, | |
{ input: ['', '123', '456', '7'], want: '1234560000' }, | |
{ input: ['', '0123', '456', '2'], want: '12345.6' }, | |
{ input: ['', '0', '05', '-2'], want: '0.0005' }, | |
{ input: ['', '0', '05', '2'], want: '5' }, | |
].forEach(({ input, want }, i) => { | |
console.group(`Test #${i}: ${JSON.stringify(input)}`) | |
const got = canonNumber(0, ...input) | |
console.log(got) | |
if (got !== want) { | |
throw new Error('expected ' + want) | |
} | |
console.groupEnd() | |
})) | |
// Tests for hashable | |
void ([ | |
{ input: String.raw`0`, want: String.raw`0` }, | |
{ input: String.raw`0.0`, want: String.raw`0` }, | |
{ input: String.raw`00`, want: String.raw`0` }, | |
{ input: String.raw`1e100`, want: String.raw`1e100` }, | |
{ input: String.raw`"foo"`, want: String.raw`"foo"` }, | |
{ input: String.raw`{}`, want: String.raw`{}` }, | |
{ input: String.raw`{ }`, want: String.raw`{}` }, | |
{ input: String.raw`{ "c": 1, "a": 2, "b": 3 }`, want: String.raw`{"a":2,"b":3,"c":1}` }, | |
{ input: String.raw`{ "c": 1, "a": { "~": 2, "b": 3 }}`, want: String.raw`{"a":{"b":3,"~":2},"c":1}` }, | |
{ input: String.raw`{ "": [] }`, want: String.raw`{"":[]}` }, | |
{ input: String.raw`[]`, want: String.raw`[]` }, | |
{ input: String.raw`[ ]`, want: String.raw`[]` }, | |
{ input: String.raw`"\u0022"`, want: String.raw`"\""` }, | |
{ input: String.raw`"\\\/"`, want: String.raw`"\\/"` }, | |
{ | |
input: String.raw`{ | |
"foo" : { | |
"boo": null, | |
"123.456e1" : [ 123.456e1, "foo\/bar\u0022"], | |
"bar":"baz" | |
} | |
} `, | |
want: String.raw`{"foo":{"123.456e1":[1234.56,"foo/bar\""],"bar":"baz","boo":null}}` | |
}, | |
].forEach(({ input, want }, i) => { | |
console.group(`Test #${i}: ${JSON.stringify(input)}`) | |
const got = hashable(input) | |
if (got !== want) { | |
throw new Error(`got ${got}, expected ${want}`) | |
} | |
console.groupEnd() | |
})) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Mike,
Although the JSON canonicalization concept I'm working on is very different to the one above I have adopted a term I got from you "Hashable" JSON:
https://tools.ietf.org/html/draft-rundgren-json-canonicalization-scheme-05
One reviewer claimed it was a very confused term but I think it is perfect : )
Thanx!