hpiwowar · April 18, 2018 14:25
diff --git a/gistfile1.txt b/gistfile1.txt
 def get_row_iterator_jsonl_gz(filename, log_period_s=5): # log_period_s=None for no logging
     """Yield one parsed JSON value per line of a gzip-compressed JSON-lines file.

     Before parsing, every doubled backslash in a line is collapsed to a
     single backslash.  A line that consists only of the two characters
     backslash + N is skipped (presumably a database NULL marker -- confirm
     with whoever produces the file).  A line that is not valid UTF-8 or not
     valid JSON is reported through ``logging.error`` and skipped, so one bad
     row does not abort the scan.

     Args:
         filename: path of the ``.jsonl.gz`` file to read.
         log_period_s: minimum number of seconds between progress messages
             sent to ``logging.info``; ``None`` (or 0) disables them.

     Yields:
         The value returned by ``json.loads`` for each parseable line.
     """
     import gzip
     import json
     import logging
     import os.path
     import time

     # Spelled with an escaped backslash: a bare backslash-N inside a string
     # literal is an (invalid) named-unicode escape on Python 3.
     null_marker = b'\\N'

     # this is the compressed file size
     # looks like the gzip format only defines the uncompressed file size modulo 2^32
     file_size = os.path.getsize(filename)

     t0 = t_log = time.time()

     # Open the compressed file ourselves so that its tell() reports how many
     # compressed bytes have been consumed, without reaching into GzipFile's
     # non-public ``fileobj`` attribute.
     with open(filename, 'rb') as raw_file, gzip.GzipFile(fileobj=raw_file, mode='rb') as f:
         for line_number, line_bytes in enumerate(f): # read line by line
             line_bytes = line_bytes.replace(b'\\\\', b'\\')

             # Lines still carry their line terminator here, so drop it before
             # comparing; otherwise the marker would never match.
             if line_bytes.rstrip(b'\r\n') == null_marker:
                 continue

             if log_period_s:
                 now = time.time()
                 if now > t_log + log_period_s:
                     # Restart the period from "now" so a slow consumer does
                     # not cause a burst of catch-up messages afterwards.
                     t_log = now
                     bytes_read = raw_file.tell()
                     logging.info('{:,} / {:,} ({:0.1f}% in {:0.1f}s)'.format(bytes_read, file_size, 100.0*float(bytes_read)/file_size, now-t0))

             try:
                 row = json.loads(line_bytes.decode('utf-8'))
             except ValueError:
                 # UnicodeDecodeError and JSON decode errors are both ValueErrors.
                 logging.error('line_number={}, line_content={}'.format(line_number, line_bytes.decode('utf-8', 'replace')))
                 continue

             yield row

 Then consume the parsed rows like this:

 for item in get_row_iterator_jsonl_gz(filename):
    ... do something with item
	def get_row_iterator_jsonl_gz(filename, log_period_s=5): # log_period_s=None for no logging
	    """Yield one parsed JSON value per line of a gzip-compressed JSON-lines file.

	    Before parsing, every doubled backslash in a line is collapsed to a
	    single backslash.  A line that consists only of the two characters
	    backslash + N is skipped (presumably a database NULL marker -- confirm
	    with whoever produces the file).  A line that is not valid UTF-8 or not
	    valid JSON is reported through ``logging.error`` and skipped, so one bad
	    row does not abort the scan.

	    Args:
	        filename: path of the ``.jsonl.gz`` file to read.
	        log_period_s: minimum number of seconds between progress messages
	            sent to ``logging.info``; ``None`` (or 0) disables them.

	    Yields:
	        The value returned by ``json.loads`` for each parseable line.
	    """
	    import gzip
	    import json
	    import logging
	    import os.path
	    import time

	    # Spelled with an escaped backslash: a bare backslash-N inside a string
	    # literal is an (invalid) named-unicode escape on Python 3.
	    null_marker = b'\\N'

	    # this is the compressed file size
	    # looks like the gzip format only defines the uncompressed file size modulo 2^32
	    file_size = os.path.getsize(filename)

	    t0 = t_log = time.time()

	    # Open the compressed file ourselves so that its tell() reports how many
	    # compressed bytes have been consumed, without reaching into GzipFile's
	    # non-public ``fileobj`` attribute.
	    with open(filename, 'rb') as raw_file, gzip.GzipFile(fileobj=raw_file, mode='rb') as f:
	        for line_number, line_bytes in enumerate(f): # read line by line
	            line_bytes = line_bytes.replace(b'\\\\', b'\\')

	            # Lines still carry their line terminator here, so drop it before
	            # comparing; otherwise the marker would never match.
	            if line_bytes.rstrip(b'\r\n') == null_marker:
	                continue

	            if log_period_s:
	                now = time.time()
	                if now > t_log + log_period_s:
	                    # Restart the period from "now" so a slow consumer does
	                    # not cause a burst of catch-up messages afterwards.
	                    t_log = now
	                    bytes_read = raw_file.tell()
	                    logging.info('{:,} / {:,} ({:0.1f}% in {:0.1f}s)'.format(bytes_read, file_size, 100.0*float(bytes_read)/file_size, now-t0))

	            try:
	                row = json.loads(line_bytes.decode('utf-8'))
	            except ValueError:
	                # UnicodeDecodeError and JSON decode errors are both ValueErrors.
	                logging.error('line_number={}, line_content={}'.format(line_number, line_bytes.decode('utf-8', 'replace')))
	                continue

	            yield row

	Then consume the parsed rows like this:

	for item in get_row_iterator_jsonl_gz(filename):
	    ... do something with item