Created
August 5, 2020 10:15
-
-
Save aSipiere/50142b8d731f1e6c9b3144c7e1877363 to your computer and use it in GitHub Desktop.
A python 3.8 update of: https://gist.github.com/jrivero/1085501 with argparse and tqdm.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import csv | |
import argparse | |
from tqdm import tqdm | |
def split(filehandler, delimiter=',', row_limit=10000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True,
          total=None):
    """
    Split a CSV file into multiple numbered pieces.

    Arguments:
        `filehandler`: An open, readable text file (or any iterable of lines)
            containing the CSV data. Open real files with `newline=''`.
        `delimiter`: The field delimiter used for reading and writing.
        `row_limit`: The number of data rows you want in each output file.
            10,000 by default. Must be at least 1.
        `output_name_template`: Either a %s-style template for the numbered
            output files (e.g. 'output_%s.csv' -> 'output_1.csv') or, when it
            contains no '%s', a plain prefix to which '<number>.csv' is
            appended (e.g. 'output_' -> 'output_1.csv').
        `output_path`: Directory in which to write the output files.
        `keep_headers`: Whether to treat the first row as a header and repeat
            it at the top of every output file.
        `total`: Optional expected number of rows, used only to size the
            progress bar. When None the bar shows a running count.

    Raises:
        ValueError: if `row_limit` is smaller than 1.

    The first output file is always created, even when the input is empty.

    Example usage:

        with open('/home/ben/input.csv', newline='') as f:
            split(f, row_limit=5000, output_path='/tmp/pieces')
    """
    if row_limit < 1:
        raise ValueError("row_limit must be a positive integer")

    # The progress bar is cosmetic: import it here and fall back to plain
    # iteration so the splitter still works where tqdm is not available.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        progress = None

    def _piece_path(piece):
        # Honour a real %s template; otherwise treat the value as a prefix,
        # which is how the command-line default ('output_') is meant to work.
        if '%s' in output_name_template:
            file_name = output_name_template % piece
        else:
            file_name = f"{output_name_template}{piece}.csv"
        return os.path.join(output_path, file_name)

    reader = csv.reader(filehandler, delimiter=delimiter)

    # next(..., None) avoids a StopIteration escaping on an empty input.
    headers = next(reader, None) if keep_headers else None

    rows = reader if progress is None else progress(reader, total=total)

    current_piece = 1
    rows_in_piece = 0
    # newline='' is what the csv module requires for files it writes;
    # without it Windows output gets a blank line after every row.
    out_file = open(_piece_path(current_piece), 'w', newline='')
    try:
        writer = csv.writer(out_file, delimiter=delimiter)
        if headers is not None:
            writer.writerow(headers)

        for row in rows:
            if rows_in_piece >= row_limit:
                # Current piece is full: close it before starting the next
                # one so handles do not accumulate and data is flushed.
                out_file.close()
                current_piece += 1
                rows_in_piece = 0
                out_file = open(_piece_path(current_piece), 'w', newline='')
                writer = csv.writer(out_file, delimiter=delimiter)
                if headers is not None:
                    writer.writerow(headers)
            writer.writerow(row)
            rows_in_piece += 1
    finally:
        out_file.close()
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--infile", "-I", help="input csv path") | |
parser.add_argument("--outpath", "-O", nargs='?', default='.', type=str, help="Where to stick the output files.") | |
parser.add_argument("--delimiter", "-D", nargs='?', default=',', type=str, help="delimiter") | |
parser.add_argument("--rows", "-R", nargs='?', default=10000, type=int, help="number of rows") | |
parser.add_argument("--name", "-N", nargs='?', default='output_', type=str, help="A %s-style template for the numbered output files") | |
args = parser.parse_args() | |
with open(args.infile) as csvfile: | |
split( | |
csvfile, | |
delimiter=args.delimiter, | |
output_name_template=args.name, | |
row_limit=args.rows, | |
output_path=args.outpath, | |
keep_headers=True | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Just a few notes: there's a hardcoded value in the tqdm loop that I got from running `wc -l` on the file (i.e. the number of newline characters), and keep_headers is always true because I forgot to make it a parser arg.