Created
October 19, 2016 13:42
-
-
Save liftoff/ee7b81659673eca23cd9fc0d8b8e68b7 to your computer and use it in GitHub Desktop.
Allow comments and trailing commas in JSON files using two simple Python functions to clean them up before parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
An example of how to remove comments and trailing commas from JSON before | |
parsing. You only need the two functions below, `remove_comments()` and | |
`remove_trailing_commas()` to accomplish this. This script serves as an | |
example of how to use them but feel free to just copy & paste them into your | |
own code/projects. Usage:: | |
json_cleaner.py some_file.json | |
Alternatively, you can pipe JSON into this script and it'll clean it up:: | |
cat some_file.json | json_cleaner.py | |
Why would you do this? So you can have human-generated .json files | |
(say, for configuration) that include comments and, really, who wants to deal | |
with catching all those trailing commas that might be present? Here's an | |
example of a file that will be successfully cleaned up and JSON-parseable: | |
.. code-block:: javascript | |
{ | |
// A comment! You normally can't put these in JSON | |
"testing": { | |
"foo": "bar", // <-- A trailing comma! No worries. | |
}, // <-- Another one! | |
/* | |
This style of comments will also be safely removed before parsing | |
*/ | |
} | |
FYI: This script will also pretty-print the JSON after it's cleaned up (if | |
using it from the command line) with an indentation level of 4 (that is, four | |
spaces). | |
""" | |
__version__ = '1.0.0' | |
__version_info__ = (1, 0, 0) | |
__license__ = "Unlicense" | |
__author__ = 'Dan McDougall <[email protected]>' | |
import re, fileinput | |
try: | |
import ujson as json # Speedup if present; no big deal if not | |
except ImportError: | |
import json | |
def remove_comments(json_like):
    """
    Strip C-style comments out of *json_like* and hand back what is left.

    Both ``// to end of line`` and ``/* block */`` comments are dropped.
    Quoted strings (single- or double-quoted) are matched by the same pattern
    and copied through untouched, so comment markers inside a string, such as
    the ``//`` in a URL, survive. Example::

        >>> remove_comments('{"foo": "bar" /* note */}')
        '{"foo": "bar" }'
        >>> remove_comments('{"url": "http://example.com"}')
        '{"url": "http://example.com"}'
    """
    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )

    def keep_only_strings(found):
        token = found.group(0)
        # Comments always begin with "/"; string literals begin with a quote.
        return "" if token.startswith('/') else token

    return pattern.sub(keep_only_strings, json_like)
# Matches either a complete double-quoted string literal (no capture group) or
# a comma followed only by whitespace and then a closing "}" or "]" (the
# closing bracket is captured as group 1).
_TRAILING_COMMA_RE = re.compile(
    r'"(?:\\.|[^"\\])*"|,\s*([}\]])',
    re.DOTALL
)


def remove_trailing_commas(json_like):
    """
    Removes trailing commas from *json_like* and returns the result.

    A trailing comma is a comma that is followed only by whitespace before
    the ``}`` or ``]`` that closes an object or array. The comma and that
    whitespace are removed; the closing bracket is kept. Commas inside
    double-quoted strings are never touched.

    The text is scanned once, left to right: string literals are consumed
    whole and copied through unchanged, which is what keeps their contents
    safe without re-checking the rest of the document for every comma.

    :param json_like: JSON-like text, with comments already removed.
    :returns: The same text with trailing commas removed.

    Example::

        >>> remove_trailing_commas('{"foo":"bar","baz":["blah",],}')
        '{"foo":"bar","baz":["blah"]}'
    """
    def replacer(match):
        closing_bracket = match.group(1)
        if closing_bracket is None:
            # The string-literal alternative matched: keep it verbatim.
            return match.group(0)
        # Drop the comma and any whitespace, keep the bracket.
        return closing_bracket

    return _TRAILING_COMMA_RE.sub(replacer, json_like)
if __name__ == "__main__":
    # Gather every input line (from the files named on the command line, or
    # from stdin when none are given) into a single string.
    raw_text = "".join(fileinput.input())
    # Comments go first, then trailing commas; what remains is parsed as JSON.
    cleaned_text = remove_trailing_commas(remove_comments(raw_text))
    parsed = json.loads(cleaned_text)
    # Pretty-print the result with a four-space indent.
    print(json.dumps(parsed, indent=4))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
objects_fixed = trailing_object_commas_re.sub("}", json_like)
fails if you have any Unicode character. You can solve it by using
objects_fixed = trailing_object_commas_re.sub("}", json_like.decode('utf-8'))