Skip to content

Instantly share code, notes, and snippets.

@gabefair
Created August 30, 2018 22:00
Show Gist options
  • Save gabefair/26f9ece0ecbe67601db05f1e899e1cdc to your computer and use it in GitHub Desktop.
Save gabefair/26f9ece0ecbe67601db05f1e899e1cdc to your computer and use it in GitHub Desktop.
Splits a giant JSONL file into multiple JSONL files.
# Gabriel Fair
# Please suggest any improvements: http://keybase.io/gabefair
import mmap
import re
import argparse
import sys
import progressbar
from time import sleep
import os
from time import clock
import atexit
from time import time
from datetime import timedelta
from time import perf_counter
#import pdb
# Module-level state shared by split_json() and write_file().

# Lines read since the current output file was started.
current_comment_count = 0
# Lines read from the input in total.
global_comment_count = 0
# Index used as the numeric suffix of the current output file.
file_count = 0
# Number of lines after which a new output file is started.
comments_per_file = 500000
# The new file will build in RAM before writing to disk. Limiting the number of disk bottlenecks
output_file_contents = ''
# Size of the input file in bytes; assigned by split_json().
file_size = 0
# Initial progress bar; split_json() rebinds this name to a new bar before use.
bar = progressbar.ProgressBar(
    maxval=100,
    widgets=[
        progressbar.Bar('=', '[', ']'),
        ' ',
        progressbar.Percentage(),
    ],
)
def _percent_done(done, total):
    """Return ``done`` as a whole percentage of ``total``, clamped to 0..100."""
    if total <= 0:
        # Nothing to read at all: treat an empty input as fully processed.
        return 100
    return max(0, min(100, done * 100 // total))


def split_json(file_argument):
    """Split a line-delimited file into chunks of ``comments_per_file`` lines.

    Lines are read from ``file_argument`` (decoded as UTF-8), collected in
    memory and handed to ``write_file`` through the module-level
    ``output_file_contents`` buffer.  The buffer is flushed to the current
    output file every 400 lines, and a new output file is started once
    ``comments_per_file`` lines have been written to the current one.

    Args:
        file_argument: Path of the input file.  Output files are named by
            ``write_file`` from this path.

    Side effects:
        Updates the module globals ``output_file_contents``,
        ``current_comment_count``, ``global_comment_count``, ``bar`` and
        ``file_size``; prints progress and a final summary to stdout.
    """
    global output_file_contents
    global current_comment_count
    global global_comment_count
    global bar
    global file_size

    # Size the file we were asked to split, not whatever sys.argv[1] happens to be.
    file_size = os.path.getsize(file_argument)
    print("Reading file: " + file_argument + " Splitting every: " + str(comments_per_file) + " comments")
    bar = progressbar.ProgressBar(redirect_stdout=True)
    bar.start()
    lap_time = perf_counter()

    # Collect lines in a list and join once per flush; repeatedly concatenating
    # onto a global string copies the whole buffer on every line.
    pending_lines = []

    with open(file_argument, 'r', encoding="utf-8") as source:
        for line in source:
            current_comment_count += 1
            global_comment_count += 1
            pending_lines.append(line)

            if current_comment_count % 400 == 0:
                output_file_contents += "".join(pending_lines)
                pending_lines.clear()
                write_file(file_argument, 1)
                # tell() on the text wrapper is disabled while iterating, so ask
                # the underlying binary buffer.  It may run slightly ahead of the
                # lines consumed (read-ahead), hence the clamp in _percent_done.
                bar.update(_percent_done(source.buffer.tell(), file_size))

            if current_comment_count % comments_per_file == 0:
                print("Total comments proccessed: "+ str(global_comment_count) + ' and the time since last update: ' + str(timedelta(seconds=perf_counter() - lap_time)))
                lap_time = perf_counter()

            if current_comment_count >= comments_per_file:
                # Current chunk is full: write what is left and move to the next file.
                output_file_contents += "".join(pending_lines)
                pending_lines.clear()
                write_file(file_argument, 0)

        # Close out a partially filled chunk.  Skipped when the last chunk was
        # already closed (or the input was empty) so no empty extra file is
        # created and file_count is not over-counted.
        if current_comment_count > 0:
            output_file_contents += "".join(pending_lines)
            pending_lines.clear()
            write_file(file_argument, 0)

        # The iterator is exhausted here, so tell() is usable again.
        bytes_read = int(source.tell())

    bar.finish()
    print("Bytes successfully read: "+ str(bytes_read) + '/' + str(file_size) + ' ('+ str(_percent_done(bytes_read, file_size)) + '%)')
    print("Total files: ", file_count)
    print("Total comments: ", global_comment_count)
def write_file(file_name, leave_open_flag):
    """Append the buffered lines to the current output file and clear the buffer.

    The output path is ``file_name`` followed by ``_`` and the zero-padded
    four-digit ``file_count``.  The file is opened in append mode, so repeated
    calls with the same ``file_count`` extend the same file.

    Args:
        file_name: Base path used to build the output file name.
        leave_open_flag: When equal to ``0`` the current output file is
            considered finished: ``file_count`` is advanced and
            ``current_comment_count`` is reset.  Any other value keeps
            writing to the same output file on the next call.

    Side effects:
        Empties the module-level ``output_file_contents`` buffer.
    """
    global current_comment_count
    global file_count
    global output_file_contents

    chunk_path = file_name + '_%04d' % file_count
    # Write UTF-8 explicitly rather than the platform default encoding, which
    # cannot represent every character on all systems.  The context manager
    # closes the handle even if the write fails.
    with open(chunk_path, 'a', encoding="utf-8") as chunk:
        chunk.write(output_file_contents)
    output_file_contents = ''

    if leave_open_flag == 0:
        file_count += 1
        current_comment_count = 0
def secondsToStr(t):
    """Render a duration given in seconds as text.

    Args:
        t: Number of seconds (int or float).

    Returns:
        The ``str()`` form of ``timedelta(seconds=t)``, e.g. ``'1:01:01'``
        for ``3661``.
    """
    duration = timedelta(seconds=t)
    return str(duration)
# Horizontal rule printed above and below every log entry.
progress_bar_line = "="*40


def log(s, elapsed=None):
    """Print a framed, timestamped log entry to stdout.

    Args:
        s: Message to print after the timestamp.
        elapsed: Optional elapsed-time text; when given, an extra
            ``Elapsed time:`` line is printed.
    """
    from time import localtime, strftime

    print(progress_bar_line)
    # Show the wall-clock time.  Formatting time() as a timedelta would print
    # the seconds since the epoch as a duration ("17773 days, ...").
    print(strftime("%Y-%m-%d %H:%M:%S", localtime()), '-', s)
    # Compare against None so a falsy but present value (e.g. 0) is still shown.
    if elapsed is not None:
        print("Elapsed time:", elapsed)
    print(progress_bar_line)
    print()
def endlog(start):
    """Log the "End Program" entry together with the time elapsed since ``start``.

    Args:
        start: Start time in seconds, on the same clock as ``time()``.
    """
    elapsed_seconds = time() - start
    log("End Program", secondsToStr(elapsed_seconds))
def now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``.

    Formatting ``time()`` as a duration would instead yield the number of
    days since the epoch, which is not a usable timestamp.
    """
    from time import localtime, strftime

    return strftime("%Y-%m-%d %H:%M:%S", localtime())
def main():
    """Command-line entry point: split the file named on the command line.

    Parses one positional argument (the input path), logs the start of the
    run, splits the file, and logs the end of the run with the elapsed time
    when the interpreter exits.
    """
    parser = argparse.ArgumentParser(description='Splits giant file with many JSON objects.')
    parser.add_argument('json_file', metavar='F', help='a file containing valid json')
    # Exits with a usage message when the path is missing, rather than
    # failing later with an IndexError on sys.argv.
    args = parser.parse_args()

    start_time = time()
    # endlog needs the start time; registering it without arguments raises
    # TypeError at exit.  Registering it here (instead of also calling it
    # directly) logs the end exactly once, including when the split fails.
    atexit.register(endlog, start_time)
    log("Start Program")
    split_json(args.json_file)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment