Skip to content

Instantly share code, notes, and snippets.

@arcanosam
Last active June 29, 2017 00:16
Show Gist options
  • Save arcanosam/103bc6f9245f358631a1612661945851 to your computer and use it in GitHub Desktop.
Save arcanosam/103bc6f9245f358631a1612661945851 to your computer and use it in GitHub Desktop.
Reading chunks of a JSON file and identifying each JSON structure as the chunks are read
"""reading from a json file in chunks and added each json structure on a list
"""
import re
import json
# path to a json file
json_file = '<<fill with a complete path to a json file>>'
# this variable hold text read from file until found a json structure
text_read = ''
# every json structure found is added as dict on a list
lst_dct_jsons = []
# this variable hold json structure found
json_founded = ''
def json_from_text(txt):
    """Return the first top-level JSON-object-looking substring of *txt*.

    :param txt: text accumulated from the JSON file
    :return: the first ``{...}`` substring found, or ``None`` when *txt*
        contains no complete brace pair
    source: https://stackoverflow.com/a/34960703

    NOTE: the non-greedy pattern stops at the FIRST closing brace, so a
    nested object such as ``{"a": {"b": 1}}`` is truncated to
    ``{"a": {"b": 1}``.  This matches the original behavior; only flat
    (non-nested) objects are extracted correctly.
    """
    # Raw string for the regex pattern (idiomatic; the original used a
    # plain string, which happens to work here but is fragile).
    match = re.findall(r'{.*?}', txt)
    return match[0] if match else None
def read_in_chunks(file_object, chunk_size=100):
    """Lazily yield *file_object* piece by piece.

    :param file_object: any object with a ``read(size)`` method
    :param chunk_size: bytes/characters per read (default: 100)
    source: https://stackoverflow.com/a/519653
    """
    while True:
        piece = file_object.read(chunk_size)
        if piece:
            yield piece
        else:
            # Empty read means EOF: end the generator.
            return
def _clean(raw):
    """Normalize a raw chunk: drop newlines and the list brackets.

    NOTE: the original also deleted every space character, which corrupted
    spaces inside JSON string values (``{"a": "b c"}`` became
    ``{"a":"bc"}``); plain spaces are now preserved.  Brackets inside
    string values are still mangled — a limitation inherited from the
    original approach.
    """
    return raw.replace('\n', '').replace('[', '').replace(']', '')


with open(json_file, 'r', encoding='utf-8') as file_h:
    # Create the chunk generator ONCE.  The original built a brand-new
    # generator for every read; that only worked by accident because the
    # underlying file handle keeps its position between calls.
    chunks = read_in_chunks(file_h)
    # Prime the buffer.  An empty file yields nothing (the original raised
    # an uncaught StopIteration here).
    text_read = ''.join([text_read, _clean(next(chunks, ''))])
    # Consume the buffer, refilling it from the file, until both the
    # buffer and the file are exhausted.
    while len(text_read) != 0:
        # re.findall never raises ValueError for this fixed pattern, so
        # the original try/except around this call was dead code.
        json_found = json_from_text(text_read)
        if json_found is not None:
            # The `encoding` keyword of json.loads was deprecated in 3.1
            # and removed in Python 3.9; a str input needs no encoding.
            lst_dct_jsons.append(json.loads(json_found))
            # Remove only the FIRST occurrence of the extracted object:
            # replacing all occurrences (as before) silently dropped
            # duplicate records from the buffer.
            text_read = text_read.replace(json_found, '', 1).replace(',', '', 1).strip()
        chunk = next(chunks, None)
        if chunk is not None:
            text_read = ''.join([text_read, _clean(chunk)])
        elif json_found is None:
            # No complete JSON object in the leftover text and nothing
            # left to read: stop.  The original looped forever here,
            # since nothing could ever change text_read again.
            break
# The file is closed by the `with` block; the original's explicit
# file_h.close() was redundant.
print('Founded json records/structures: {0}.'.format(len(lst_dct_jsons)))
print('Size of remain text from file(if exists):{0}'.format(len(text_read)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment