Skip to content

Instantly share code, notes, and snippets.

@KristobalJunta
Created June 1, 2021 14:57
Show Gist options
  • Save KristobalJunta/b9ae3ab3d9a4bd365c6f575734672610 to your computer and use it in GitHub Desktop.
Save KristobalJunta/b9ae3ab3d9a4bd365c6f575734672610 to your computer and use it in GitHub Desktop.
Fast (probably) splitting of CSV files by number of rows (accounts for line breaks inside cell values).
#!/usr/bin/env python2
import os
import sys
def _open_csv_output(path):
    """Open *path* for writing in the mode the csv module expects."""
    if sys.version_info[0] >= 3:
        # Python 3: text mode with newline='' so csv.writer alone decides the
        # line terminator and embedded line breaks are written untouched.
        return open(path, 'w', newline='')
    # Python 2: csv.writer emits byte strings and expects a binary file.
    return open(path, 'wb')


def split(filehandler, delimiter=',', row_limit=700000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    """Split a CSV stream into numbered files of at most ``row_limit`` rows.

    Rows are parsed with ``csv.reader``, so a quoted cell that contains line
    breaks counts as part of a single row.

    Args:
        filehandler: Iterable of lines (typically an open file) with the CSV
            input.
        delimiter: Field separator used for both reading and writing.
        row_limit: Maximum number of data rows per output file, not counting
            the repeated header row. Not validated; intended to be a positive
            integer.
        output_name_template: ``%``-format template that receives the 1-based
            piece number and yields the output file name.
        output_path: Directory the pieces are written to; it must already
            exist.
        keep_headers: When true, the first input row is treated as a header
            and written at the top of every piece.

    Returns:
        None. The first piece is always created, even for empty input.
    """
    import csv

    reader = csv.reader(filehandler, delimiter=delimiter)

    current_piece = 1
    current_limit = row_limit
    current_out_file = _open_csv_output(
        os.path.join(output_path, output_name_template % current_piece)
    )
    try:
        current_out_writer = csv.writer(current_out_file, delimiter=delimiter)

        headers = None
        if keep_headers:
            # next() with a default works on Python 2 and 3 and tolerates an
            # empty input instead of raising StopIteration.
            headers = next(reader, None)
            if headers is not None:
                current_out_writer.writerow(headers)

        for i, row in enumerate(reader):
            if i + 1 > current_limit:
                # Current piece is full: flush and release it before starting
                # the next one so only one output handle is open at a time.
                current_out_file.close()
                current_piece += 1
                current_limit = row_limit * current_piece
                current_out_file = _open_csv_output(
                    os.path.join(output_path, output_name_template % current_piece)
                )
                current_out_writer = csv.writer(current_out_file, delimiter=delimiter)
                if headers is not None:
                    current_out_writer.writerow(headers)
            current_out_writer.writerow(row)
    finally:
        # close() is idempotent, so this is safe on every exit path.
        current_out_file.close()
if __name__ == '__main__':
    # Script entry point: split the CSV file named by the first command-line
    # argument using split()'s default settings. Extra arguments are ignored.
    if len(sys.argv) < 2:
        # Exit with a usage message on stderr instead of a bare IndexError.
        sys.exit('usage: %s INPUT.csv' % sys.argv[0])
    # Open the input without newline translation so line breaks inside quoted
    # cells reach the csv reader unchanged.
    if sys.version_info[0] >= 3:
        source = open(sys.argv[1], newline='')
    else:
        source = open(sys.argv[1], 'rb')
    with source as f:
        split(f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment