Last active
June 29, 2017 00:16
-
-
Save arcanosam/103bc6f9245f358631a1612661945851 to your computer and use it in GitHub Desktop.
Reading a JSON file in chunks and identifying each JSON structure as the chunks are read
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Read a JSON file in chunks and add each JSON structure found to a list.
"""
import json
import re
# Path to the JSON file to process.
json_file = '<<fill with a complete path to a json file>>'

# Text read from the file that has not yet been matched to a JSON structure.
text_read = ''

# Every JSON structure found is decoded and appended to this list as a dict.
lst_dct_jsons = []

# NOTE: not read anywhere in this script (the processing loop keeps its current
# match in ``json_found``); retained so the module's names stay unchanged.
json_founded = ''
def json_from_text(txt):
    """Return the first complete ``{...}`` structure found in *txt*.

    The text is scanned from its first ``{`` while tracking brace depth and
    skipping over double-quoted strings (including backslash escapes), so
    nested objects and braces inside string values are handled. The earlier
    regex approach (https://stackoverflow.com/a/34960703) stopped at the
    first ``}`` and therefore truncated such structures.

    The returned text is only brace-balanced; it is not validated as JSON.

    :param txt: text read from a json file (may hold partial structures)
    :return: the substring of *txt* holding the first balanced structure,
        or ``None`` when *txt* is empty or holds no complete structure yet
    """
    if not txt:
        return None

    start = txt.find('{')
    if start == -1:
        return None

    depth = 0
    in_string = False
    escaped = False
    for offset, char in enumerate(txt[start:]):
        if in_string:
            if escaped:
                # The character after a backslash never ends the string.
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return txt[start:start + offset + 1]

    # The opening brace was never closed: the structure is still incomplete.
    return None
def read_in_chunks(file_object, chunk_size=100):
    """Lazy function (generator) to read a file piece by piece.

    Each piece is the result of ``file_object.read(chunk_size)``, so the
    unit of ``chunk_size`` is whatever that ``read`` counts (characters for
    a text-mode file, bytes for a binary one). Default chunk size: 100.

    source:https://stackoverflow.com/a/519653

    :param file_object: object exposing ``read(size)``
    :param chunk_size: size passed to every ``read`` call
    :return: generator yielding each non-empty piece in file order
    """
    while True:
        data = file_object.read(chunk_size)
        if not data:
            # An empty read means the file is exhausted.
            break
        yield data
# Characters that may surround or separate JSON structures in the file:
# whitespace, the comma between records and the brackets of an enclosing array.
# They are only ever stripped from text lying OUTSIDE a structure, so spaces
# and arrays inside a record are left untouched.
_SEPARATORS = ' \t\r\n,[]'

with open(json_file, 'r', encoding='utf-8') as file_h:
    # A single generator is reused for the whole file.
    chunks = read_in_chunks(file_h)

    while True:
        # Look for a complete, decodable json structure in the buffered text.
        record = None
        try:
            json_found = json_from_text(text_read)
            if json_found is not None:
                # No ``encoding`` argument: json.loads() rejects it on
                # Python 3.9+ and the text is already decoded.
                record = json.loads(json_found)
        except ValueError:
            # Candidate text is not valid json: treat it as "nothing found"
            # and try again once more text has been read.
            json_found = None

        if json_found is not None:
            lst_dct_jsons.append(record)

            # Remove only this occurrence from the buffer (identical records
            # that follow must be kept), together with the separators that
            # surrounded it.
            before, _, after = text_read.partition(json_found)
            text_read = before.strip(_SEPARATORS) + after.lstrip(_SEPARATORS)
            continue

        # Nothing (more) to extract from the buffer: read further, or stop
        # once the file is exhausted so leftover text cannot loop forever.
        chunk = next(chunks, None)
        if chunk is None:
            break

        # Line breaks are insignificant between json tokens and cannot occur
        # unescaped inside json strings, so dropping them is safe; it keeps
        # each structure on a single line for the matcher.
        text_read = ''.join([text_read, chunk.replace("\n", '')])

# Separators left over around the last structure are not unprocessed content.
text_read = text_read.strip(_SEPARATORS)

print('Founded json records/structures: {0}.'.format(len(lst_dct_jsons)))
print('Size of remain text from file(if exists):{0}'.format(len(text_read)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment