Last active
January 23, 2019 19:27
-
-
Save ethanresnick/e86661f15c4ea780dc6fbeb44dee9b74 to your computer and use it in GitHub Desktop.
Fix "JSON" that contains invalid strings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Matches two double quotes and any characters between them, without stopping
// at backslash-escaped double quotes that appear in the middle. This is a lot
// like JS's string literal syntax, except that it will match some characters
// between the double quotes that JS would require be backslash escaped --
// most notably, the newline, which must be \n in string literals.
// Note: we use [^] instead of . below to match any character, because in JS
// the dot does not match line terminators, and the presence of the multiline
// flag does not change that.
const STRING_LITERAL_LIKE = /"([^"\\]|\\[^])*"/g;
// Matches raw ascii control characters and errant backslashes, i.e.
// backslashes that do not start one of the escape sequences JSON allows:
// \b \t \n \f \r \\ \/ \" or \u followed by four hexadecimal digits.
// See test file for description of how some errant backslashes
// can't be matched because they accidentally escape string close
// quotes (directly or indirectly).
const CONTROL_CHAR = /[\u0000-\u001F]/;
// The \u escape takes four *hex* digits (not just decimal ones), so that a
// valid escape such as \u00e9 is left alone instead of being double-escaped.
const ERRANT_BACKSLASH = /\\(?!([btnfr\\/"]|(u[0-9a-fA-F]{4})))/;
// Global union of the two patterns above; each match is exactly one character
// (the lookahead in ERRANT_BACKSLASH consumes nothing).
const INVALID_JSON_STRING_CHARACTER = new RegExp(
  "(?:" + CONTROL_CHAR.source + ")"
    + "|"
    + "(?:" + ERRANT_BACKSLASH.source + ")",
  "g"
);
// The characters that JSON strings use, after a backslash, to represent a
// character that must be backslash escaped, keyed by their ascii code.
const JSON_CHAR_NAMES = {
  // control character codes to replace with special letters
  8: 'b',
  9: 't',
  10: 'n',
  12: 'f',
  13: 'r',
  // characters that are allowed to or must have a leading backslash,
  // but should be passed through as is after the backslash.
  92: '\\',
  47: '/',
};

// Takes a single character string where the character is an ASCII control
// character or a backslash and returns the escape sequence used for
// representing that character in a JSON string.
//
// Characters listed in JSON_CHAR_NAMES get their short form (e.g. "\n");
// anything else is written as \uXXXX. Only the first UTF-16 code unit of
// `char` is looked at.
function escapeJSONStringChar(char) {
  const code = char.charCodeAt(0);
  const charName = JSON_CHAR_NAMES[code];
  if (charName !== undefined) {
    return "\\" + charName;
  }
  // A \u escape always has exactly four hex digits, so pad to four rather
  // than assuming the code fits in two digits.
  return "\\u" + code.toString(16).padStart(4, "0");
}
// Takes a string that's almost valid JSON except that the control characters
// and backslashes inside of string literals haven't been properly escaped,
// and returns a true JSON string formed by escaping those characters.
// Text outside of the double-quoted spans is returned untouched.
function fixJSON(jsonLike) {
  return jsonLike.replace(STRING_LITERAL_LIKE, function (match) {
    // Within one string-literal-like span, escape every offending character.
    return match.replace(INVALID_JSON_STRING_CHARACTER, escapeJSONStringChar);
  });
}

// The test file loads this function via `require("./index")`, so it has to be
// exported. The guard keeps the file loadable where `module` does not exist.
if (typeof module !== "undefined" && module.exports) {
  module.exports.fixJSON = fixJSON;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const jsc = require("jsverify"); | |
const R = require("ramda"); | |
const { fixJSON } = require("./index"); | |
// The generator for improper JSON strings that we'll
// feed into our program.
const borkedJSONStringLiteral = jsc.suchthat(
  jsc.string.smap(
    // This is likely what the website's doing to build their "JSON".
    // The patterns are regexes with the g flag because replace() with a
    // string pattern only touches the first occurrence, which would leave
    // later double quotes unescaped.
    x => `"${x.replace(/"/g, '\\"')}"`,
    // Inverse mapping: strip the wrapping quotes and undo the quote escaping.
    x => x.substring(1, x.length - 1).replace(/\\"/g, '"')
  ),
  isHandleableInput
);
/**
 * A function used by our generator above to skip generated strings
 * that our code can't handle. As described below, there are a few
 * cases that are deeply ambiguous and where we're ok if our code
 * blows up, so we want to filter out inputs that match those cases.
 *
 * @param {string} potentialString - A generated string, already wrapped
 *   in double quotes.
 * @returns {boolean} false for the ambiguous cases described in the body,
 *   true otherwise.
 */
function isHandleableInput(potentialString) {
  // If our generated borked string ends with BACKSLASH DQUOTE,
  // (because the string we tried to wrap in quotes ended with a
  // literal backslash), that results in an unterminated string
  // literal that we can't fix (because we won't know irl where
  // the string really should end), so we don't handle that case.
  if (potentialString.endsWith('\\"')) {
    return false;
  }

  // The empty string literal is just an open quote followed by a close
  // quote; there is no inner quote that could be ambiguous.
  if (potentialString === '""') {
    return true;
  }

  // We do the same if the result ends with two quotes, and the
  // second to last quote isn't escaped (imagine a literal
  // BACKSLASH DQUOTE string that goes through our smap above
  // and becomes DQUOTE BACKSLASH BACKSLASH DQUOTE DQUOTE).
  if (potentialString.endsWith('""')) {
    // count backslashes immediately before the last two quotes.
    // even number means second to last quote isn't escaped.
    let backslashCount = 0;
    let index = potentialString.length - 3;
    while (index >= 0 && potentialString[index] === '\\') {
      backslashCount += 1;
      index -= 1;
    }
    return backslashCount % 2 !== 0;
  }

  return true;
}
// Checks that whatever fixJSON() returns for a borked string literal can be
// parsed by JSON.parse without throwing.
describe("Fix strings", () => {
  it("should generate parseable output (manual cases)", () => {
    // Input: a raw backslash directly followed by a raw newline, in quotes.
    JSON.parse(fixJSON("\"\\\n\""));
  });

  // A regular function rather than an arrow function, because the body
  // calls this.timeout() on the context the test runner provides.
  it("should generate parseable output (auto cases)", function() {
    this.timeout(Infinity);

    jsc.assert(
      jsc.forall(borkedJSONStringLiteral, (str) => {
        try {
          JSON.parse(fixJSON(str));
          return true;
        } catch (e) {
          // Print the failing input next to its "fixed" form for debugging.
          console.log(str, fixJSON(str));
          return false;
        }
      }),
      { tests: 8000 }
    );
  });
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment