Created
April 18, 2018 14:25
-
-
Save hpiwowar/032a29be17becfebe17831ab0c4b5bf9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_row_iterator_jsonl_gz(filename, log_period_s=5):  # log_period_s=None for no logging
    """Yield one decoded JSON value per line of a gzip-compressed JSON-lines file.

    Before parsing, every doubled backslash in a line is collapsed to a single
    backslash, and lines consisting solely of the ``\\N`` marker are skipped.
    NOTE(review): this looks like un-escaping of a PostgreSQL ``COPY`` text
    dump (where ``\\N`` means NULL) -- confirm against the data producer.

    Lines that cannot be decoded as UTF-8 or parsed as JSON are reported with
    ``logging.error`` and skipped; they do not stop the iteration.

    Args:
        filename: Path of the ``.jsonl.gz`` file to read.
        log_period_s: Minimum number of seconds between progress messages
            (emitted with ``logging.info``). Pass ``None`` (or ``0``) to
            disable progress logging.

    Yields:
        The object returned by ``json.loads`` for each valid line.
    """
    import gzip
    import json
    import logging
    import os.path
    import time

    # This is the compressed file size: the gzip format only records the
    # uncompressed size modulo 2^32, so progress is measured in compressed
    # bytes instead.
    file_size = os.path.getsize(filename)

    # NOTE: f.fileobj.tell() is used to get the number of compressed bytes
    # read so far. This is a little hacky (not documented public API of the
    # gzip module) and, because of read-ahead buffering, only approximate.
    t0 = t_log = time.time()
    with gzip.open(filename, 'rb') as f:
        for line_number, raw_line in enumerate(f):  # read line by line
            # The file is read as bytes, so every literal here must be bytes.
            line_content = raw_line.replace(b'\\\\', b'\\')

            # Lines keep their trailing newline; strip it before comparing,
            # otherwise the marker would never match.
            if line_content.rstrip(b'\r\n') == b'\\N':
                continue

            if log_period_s:
                now = time.time()
                if now > t_log + log_period_s:
                    # Restart the period from "now" so a slow consumer does
                    # not trigger a burst of catch-up messages.
                    t_log = now
                    bytes_read = f.fileobj.tell()
                    logging.info('{:,} / {:,} ({:0.1f}% in {:0.1f}s)'.format(
                        bytes_read, file_size,
                        100.0 * float(bytes_read) / file_size, now - t0))

            # Keep the yield outside the try block so that only decode/parse
            # failures are swallowed, never exceptions raised by the consumer.
            try:
                row = json.loads(line_content.decode('utf-8'))
            except ValueError:  # JSONDecodeError and UnicodeDecodeError
                logging.error('line_number={}, line_content={}'.format(
                    line_number, line_content))
                continue
            yield row
Then:
    for item in get_row_iterator_jsonl_gz(filename):
        ...  # do something with item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment