Created
June 1, 2021 14:57
-
-
Save KristobalJunta/b9ae3ab3d9a4bd365c6f575734672610 to your computer and use it in GitHub Desktop.
Fast (probably) splitting of CSV files by number of rows (handles line breaks inside cell values).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2
import os
import sys
def split(filehandler, delimiter=',', row_limit=700000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    """Split a CSV stream into numbered files of at most ``row_limit`` rows.

    Rows are parsed with the ``csv`` module rather than split on raw lines,
    so a quoted cell that contains line breaks counts as a single row.

    Args:
        filehandler: Open file-like object (or any iterable of lines)
            holding the CSV data to split.
        delimiter: Field separator used for both reading and writing.
        row_limit: Maximum number of data rows per output file; the header
            row is not counted.
        output_name_template: ``%``-style template that receives the piece
            number (starting at 1) and yields the output file name.
        output_path: Directory the output files are written to.
        keep_headers: When true, the first input row is treated as a header
            and repeated at the top of every output file.

    The first output file is always created, even for empty input. Empty
    input with ``keep_headers=True`` leaves that file empty instead of
    raising ``StopIteration``.
    """
    import csv

    def open_piece(piece):
        # The csv module does its own line-ending handling, so the file must
        # be opened without newline translation: newline='' on Python 3,
        # binary mode on Python 2.
        path = os.path.join(output_path, output_name_template % piece)
        if sys.version_info[0] >= 3:
            handle = open(path, 'w', newline='')
        else:
            handle = open(path, 'wb')
        return handle, csv.writer(handle, delimiter=delimiter)

    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_limit = row_limit
    out_file, writer = open_piece(current_piece)
    try:
        headers = None
        if keep_headers:
            # next(reader, None) works on Python 2 and 3 and tolerates an
            # empty input instead of leaking StopIteration to the caller.
            headers = next(reader, None)
            if headers is not None:
                writer.writerow(headers)
        for row_number, row in enumerate(reader, 1):
            # current_limit is the cumulative row count at which the current
            # piece is full; crossing it starts the next piece.
            if row_number > current_limit:
                out_file.close()
                current_piece += 1
                current_limit = row_limit * current_piece
                out_file, writer = open_piece(current_piece)
                if headers is not None:
                    writer.writerow(headers)
            writer.writerow(row)
    finally:
        # Always release the piece currently open so its rows are flushed.
        out_file.close()
if __name__ == '__main__':
    # Script entry point: split the CSV file named by the first command-line
    # argument using split()'s default settings.
    if len(sys.argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        sys.exit('usage: %s <input.csv>' % sys.argv[0])
    # The csv module expects untranslated line endings so that line breaks
    # embedded in quoted cells survive: newline='' on Python 3, binary mode
    # on Python 2.
    if sys.version_info[0] >= 3:
        source = open(sys.argv[1], newline='')
    else:
        source = open(sys.argv[1], 'rb')
    with source as f:
        split(f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment