-
-
Save perXautomatik/065327f4f5c7607dd2cb9d11d85e2bc2 to your computer and use it in GitHub Desktop.
A library for writing and reading JSON streams from files (optionally supports gz compression)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
JsonStream by Gareth George

LICENSE: The do whatever the heck you want with it license.
Do what you want with it, when you want to do it. I take no responsibility for any damages that may occur in the usage of this script. This means no guarantees as to its reliability or stability.

USAGE:
    file = open_stream('filename.json', 'rb')     # opens the file with the given file name to be read as a json stream
    file = open_stream('filename.json.gz', 'rb')  # opens the COMPRESSED file with the file name to be read as a json stream
    file = open_stream('filename.json.gz', 'wb')  # opens the COMPRESSED file for writing out json data.

It is worth noting that these files must be opened in binary mode. This results from the implementation detail that the JSON stream class is a subclass of io.TextIOWrapper, which expects the argument to its constructor to implement the binary stream API.

Once you have a handle to a file, writing to it is as simple as calling:
    file.write("text") to write some text... why you would do this I do not know.
    file.writeJSON({"hello": "world"}) will write a single json object into the file. This function is unicode safe (hopefully).
    file.readJSON() will read a single JSON object from the file; however, its usage is discouraged in favor of iterJSON since readJSON is slower. It is still provided, however, just in case you need it.
    file.iterJSON() returns an iterator over all of the json objects found in the file.

Usage examples for file.iterJSON():
    json_objects = list(file.iterJSON())  # returns a list of every json object in the file. You can do this; however, it is discouraged because it requires you to load every object in the file into memory, which can be impractical for very large files. Note that even if the file is only a gigabyte compressed it may contain 8 gigabytes of JSON data, so even seemingly small files can cause your memory usage to EXPLODE!

    for obj in file.iterJSON():
        print(obj)  # prints out every object in the JSON file. This is the preferred way of accessing the objects in a file as it allows you to lazily load them, print them, and then allow the garbage collector to clear the object out of memory! Happy times.

Enjoy!
""" | |
import os | |
import json | |
import io | |
try: | |
import gzip | |
except ImportError: pass | |
try: | |
import lzma | |
except ImportError: pass | |
try: | |
import bz2 | |
except ImportError: pass | |
DEFAULT_BLOCK_SIZE = 1024 * 4  # default number of characters requested per read() call


class JsonStream(io.TextIOWrapper):
    """Text wrapper around a binary stream holding a sequence of JSON values.

    Values are written back to back by ``writeJSON`` and are read one at a
    time with ``readJSON`` or lazily with ``iterJSON``. JSON whitespace
    (space, tab, CR, LF) before or between values is ignored when reading.

    NOTE: bare top-level numbers are not self-delimiting (``1`` followed by
    ``2`` is written as ``12``); wrap numbers in an array or object if they
    have to round-trip.
    """

    @staticmethod
    def _skip_whitespace(text, pos):
        """Return the index of the first non-JSON-whitespace char at or after ``pos``."""
        end = len(text)
        while pos < end and text[pos] in ' \t\n\r':
            pos += 1
        return pos

    def readJSON(self, blocksize=DEFAULT_BLOCK_SIZE, encoding='utf-8'):
        """Read and return the next JSON value from the stream.

        Args:
            blocksize: number of characters requested per ``read()`` call.
            encoding: unused; accepted for backward compatibility.

        Returns:
            The decoded value, with the stream left positioned just after it.
            ``None`` if no complete value could be decoded before end of
            file, in which case the stream is restored to where it was when
            the call started. (A JSON ``null`` also decodes to ``None``.)

        The underlying stream must support ``tell()`` and ``seek()``.
        """
        start_cookie = self.tell()
        decoder = json.JSONDecoder(strict=False)
        buffer = ''
        for block in iter(lambda: self.read(blocksize), ''):
            buffer += block
            # raw_decode() does not skip leading whitespace by itself.
            value_start = self._skip_whitespace(buffer, 0)
            try:
                result, value_end = decoder.raw_decode(buffer, value_start)
            except ValueError:
                # Value is still incomplete; pull in another block and retry.
                continue
            # Text streams only accept seek() targets obtained from tell(),
            # so instead of a relative seek we rewind to the start and
            # re-read exactly the characters that were consumed.
            self.seek(start_cookie)
            self.read(value_end)
            return result
        self.seek(start_cookie)
        return None

    def iterJSON(self, blocksize=DEFAULT_BLOCK_SIZE, encoding='utf-8'):
        """Lazily yield every JSON value from the current position onwards.

        Args:
            blocksize: number of characters requested per ``read()`` call.
            encoding: unused; accepted for backward compatibility.

        Yields:
            Each decoded value, in stream order. Text left over at end of
            file that does not form a complete value is silently dropped.
        """
        decoder = json.JSONDecoder(strict=False)
        buffer = ''
        for block in iter(lambda: self.read(blocksize), ''):
            buffer += block
            pos = self._skip_whitespace(buffer, 0)
            end = len(buffer)
            while pos < end:
                try:
                    result, pos = decoder.raw_decode(buffer, pos)
                except ValueError:
                    # Incomplete value: keep the tail and wait for more data.
                    break
                yield result
                pos = self._skip_whitespace(buffer, pos)
            # Drop everything already decoded in one slice per block.
            buffer = buffer[pos:]

    def writeJSON(self, object):
        """Serialize ``object`` as JSON and write it to the stream.

        Non-ASCII characters are written as-is (``ensure_ascii=False``). No
        separator is written after the value.
        """
        json.dump(object, self, ensure_ascii=False)
def open_stream(filename, mode, *args, **argz):
    """Open ``filename`` as a ``JsonStream``, decompressing based on its extension.

    Args:
        filename: path of the file to open. The extension selects the
            opener: ``.gz`` -> gzip, ``.lzma``/``.lz``/``.xz`` -> lzma,
            ``.bz2`` -> bz2, anything else -> the built-in ``open``.
        mode: file mode such as ``'rb'`` or ``'wb'``. The stream handed to
            ``JsonStream`` has to be binary, so a string mode is coerced to
            its binary form (``'r'`` and ``'rt'`` are treated as ``'rb'``).
        *args, **argz: forwarded unchanged to the selected opener.

    Returns:
        A ``JsonStream`` using UTF-8 that wraps the opened binary file.

    Raises:
        ImportError: if the compression module needed for the extension is
            not available in this Python installation.
    """
    if isinstance(mode, str):
        # Strip the text flag and force binary.
        mode = mode.replace('t', '')
        if 'b' not in mode:
            mode += 'b'

    ext = os.path.splitext(filename)[-1].lower()
    # The compression modules are optional at module level, so import the
    # one we need here: a missing module then fails with a clear ImportError
    # instead of a NameError.
    if ext == '.gz':
        import gzip
        raw = gzip.GzipFile(filename, mode, *args, **argz)
    elif ext in ('.lzma', '.lz', '.xz'):
        # LZMAFile auto-detects the container when reading and writes XZ by
        # default, so '.xz' files are handled by the same opener.
        import lzma
        raw = lzma.LZMAFile(filename, mode, *args, **argz)
    elif ext == '.bz2':
        import bz2
        raw = bz2.BZ2File(filename, mode, *args, **argz)
    else:
        raw = open(filename, mode, *args, **argz)

    try:
        return JsonStream(raw, encoding='utf-8')
    except Exception:
        # Do not leak the underlying file handle if wrapping fails.
        raw.close()
        raise
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment