Skip to content

Instantly share code, notes, and snippets.

@hpiwowar
Created April 18, 2018 14:25
Show Gist options
  • Save hpiwowar/032a29be17becfebe17831ab0c4b5bf9 to your computer and use it in GitHub Desktop.
Save hpiwowar/032a29be17becfebe17831ab0c4b5bf9 to your computer and use it in GitHub Desktop.
def get_row_iterator_jsonl_gz(filename, log_period_s=5):  # log_period_s=None for no logging
    """Yield one parsed JSON value per line of a gzip-compressed JSON Lines file.

    Before parsing, every doubled backslash in a line is collapsed to a single
    one, and a line consisting solely of ``\\N`` is skipped.
    NOTE(review): this looks like the escaping of a PostgreSQL ``COPY`` text
    export (``\\N`` = NULL) -- confirm against whatever produces the file.

    Lines that cannot be decoded as UTF-8 or parsed as JSON are reported with
    ``logging.error`` and skipped, so one bad row does not stop the iteration.

    Args:
        filename: Path of the ``.jsonl.gz`` file to read.
        log_period_s: Minimum number of seconds between progress messages
            (emitted with ``logging.info``). ``None`` or ``0`` disables
            progress logging.

    Yields:
        The value returned by ``json.loads`` for each parseable line.
    """
    import gzip
    import json
    import logging
    import os.path
    import time

    # This is the *compressed* file size. Progress is measured against it
    # because the gzip format only stores the uncompressed size modulo 2**32.
    file_size = os.path.getsize(filename)
    t0 = t_log = time.time()

    # Open the raw file ourselves so that raw.tell() reports how many
    # compressed bytes have been consumed, instead of reaching into the
    # non-public GzipFile.fileobj attribute.
    with open(filename, 'rb') as raw, gzip.GzipFile(fileobj=raw) as gz:
        for line_number, line_content in enumerate(gz):  # read line by line
            # Strip the line terminator first: with it still attached the
            # comparison against the \N marker below could never succeed.
            line_content = line_content.rstrip(b'\r\n').replace(b'\\\\', b'\\')
            if line_content == b'\\N':
                continue

            if log_period_s:
                now = time.time()
                if now > t_log + log_period_s:
                    # Restart the period from "now" so a consumer that stalls
                    # between rows does not trigger a burst of catch-up logs.
                    t_log = now
                    compressed_read = raw.tell()
                    percent = 100.0 * compressed_read / file_size if file_size else 100.0
                    logging.info('{:,} / {:,} ({:0.1f}% in {:0.1f}s)'.format(
                        compressed_read, file_size, percent, now - t0))

            try:
                row = json.loads(line_content.decode('utf-8'))
            except ValueError:
                # Covers both UnicodeDecodeError and JSON parse errors.
                logging.error('line_number={}, line_content={}'.format(
                    line_number, line_content.decode('utf-8', 'replace')))
                continue
            # Yield outside the try block so a ValueError raised by the
            # consumer is never mistaken for a parse failure.
            yield row
Then use it like this:
for item in get_row_iterator_jsonl_gz(filename):
    ...  # do something with item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment