Created August 30, 2018 22:00
Splits a giant JSONL file into multiple JSONL files.
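Usage: pass the path of the input file as the only argument. The script writes numbered chunks named after the input (input.jsonl_0000, input.jsonl_0001, ...), each holding up to 500,000 lines. A minimal invocation, assuming the script is saved as split_jsonl.py (that file name is hypothetical):

python split_jsonl.py comments.jsonl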
# Gabriel Fair
# Please suggest any improvements: http://keybase.io/gabefair
import argparse
import atexit
import os
import sys
from datetime import timedelta
from time import ctime, perf_counter, time

import progressbar  # the progressbar2 package

current_comment_count = 0
global_comment_count = 0
file_count = 0
comments_per_file = 500000
output_file_contents = ''  # Build each output chunk in RAM before writing, limiting disk bottlenecks
file_size = 0
bar = None  # constructed in split_json() once the input file is known
def split_json(file_argument):
    global output_file_contents
    global current_comment_count
    global bar
    global global_comment_count
    global file_size
    file_size = os.path.getsize(file_argument)
    print("Reading file: " + file_argument + " Splitting every: " + str(comments_per_file) + " comments")
    bar = progressbar.ProgressBar(max_value=100, redirect_stdout=True)
    bar.start()
    lap_time = perf_counter()
    with open(file_argument, 'r', encoding="utf-8") as file:
        for line in file:
            current_comment_count += 1
            global_comment_count += 1
            output_file_contents += line
            if current_comment_count % 400 == 0:
                # Flush the RAM buffer to the current chunk file without rotating
                write_file(file_argument, 1)
            if current_comment_count % comments_per_file == 0:
                print("Total comments processed: " + str(global_comment_count) + " and the time since last update: " + str(timedelta(seconds=perf_counter() - lap_time)))
                lap_time = perf_counter()
                # file.tell() is disabled while iterating a text file, so read the
                # position of the underlying binary buffer; read-ahead makes it approximate
                bar.update(min(100, int((file.buffer.tell() / file_size) * 100)))
            if current_comment_count >= comments_per_file:
                # Rotate: finalize this chunk and start numbering the next one
                write_file(file_argument, 0)
        if current_comment_count > 0:
            write_file(file_argument, 0)  # Flush the final partial chunk
        bar.finish()
        bytes_read = file.buffer.tell()
        print("Bytes successfully read: " + str(bytes_read) + '/' + str(file_size) + ' (' + str(round((bytes_read / file_size) * 100)) + '%)')
    print("Total files: ", file_count)
    print("Total comments: ", global_comment_count)
def write_file(file_name, leave_open_flag):
    global current_comment_count
    global file_count
    global output_file_contents
    # Append the buffered lines to the current numbered chunk file
    with open(file_name + '_%04d' % file_count, 'a', encoding="utf-8") as f:
        f.write(output_file_contents)
    output_file_contents = ''
    if leave_open_flag == 0:
        # Rotate to the next chunk: bump the file number, reset the per-file counter
        file_count += 1
        current_comment_count = 0
def secondsToStr(t):
    return str(timedelta(seconds=t))

separator_line = "=" * 40

def log(s, elapsed=None):
    print(separator_line)
    print(ctime(), '-', s)
    if elapsed:
        print("Elapsed time:", elapsed)
    print(separator_line)
    print()

def endlog(start):
    elapsed = time() - start
    log("End Program", secondsToStr(elapsed))

def now():
    return ctime()
def main():
    # parser = argparse.ArgumentParser(description='Splits a giant file with many JSON objects.')
    # parser.add_argument('json_file', metavar='F', help='a file containing one JSON object per line')
    # args = parser.parse_args()
    start_time = time()
    atexit.register(endlog, start_time)  # logs the elapsed time even on an early exit
    log("Start Program")
    split_json(sys.argv[1])

if __name__ == '__main__':
    main()