Breaks a large .csv file up into smaller chunks.
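For reference, a typical invocation might look like this (the script name chunk_csv.py is a placeholder; the --n/--s/--e flags come from the argparse setup below):

python chunk_csv.py --n data --s 10000 --e iso-8859-1

This would write part/data.csv_part_1.csv, part/data.csv_part_2.csv, and so on, each holding at most 10000 data rows plus a copy of the header.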
#!/usr/bin/env python
import argparse
import csv
import sys
from pathlib import Path

import chardet

# Allow very large fields; csv's default field-size limit can choke on wide rows.
csv.field_size_limit(sys.maxsize)
parser = argparse.ArgumentParser(
    description="Chunk a large .csv file into smaller files."
)
parser.add_argument(
    "--n",
    type=str,
    required=True,
    help='Filename of the .csv file to be chunked (the ".csv" extension is optional).',
)
parser.add_argument(
    "--s", type=int, required=True, help="Max number of rows each chunk should have."
)
parser.add_argument(
    "--e",
    type=str,
    default=None,
    help='Text encoding of the input file (e.g. "iso-8859-1"); detected with chardet if omitted.',
)
args = parser.parse_args()
extension = ".csv" | |
if ".csv" in args.n: | |
extension = "" | |
filename = args.n + extension | |
chunk_size = args.s | |
encoding = args.e | |
part_dir = Path("part") | |
if not part_dir.exists(): | |
part_dir.mkdir() | |
def predict_encoding(file_path: Path, n_lines: int = 200) -> str:
    """Predict a file's encoding using chardet."""
    # Read the first n_lines of the file as raw bytes.
    with Path(file_path).open("rb") as f:
        rawdata = b"".join([f.readline() for _ in range(n_lines)])
    return chardet.detect(rawdata)["encoding"]
def write_chunk(header, part, rows):
    """Write one chunk file: the header row followed by the buffered rows."""
    out_path = part_dir / f"{filename}_part_{part}.csv"
    # newline="" prevents blank lines on Windows; reuse the input encoding.
    with out_path.open("w", newline="", encoding=encoding) as f_out:
        writer = csv.writer(f_out)
        writer.writerow(header)
        writer.writerows(rows)

# Auto-detect the encoding with chardet when --e was not supplied.
if encoding is None:
    print("predicting encoding...")
    encoding = predict_encoding(Path(filename), 5000)
    print(f"found encoding: {encoding}")
rows = []
with open(filename, "r", encoding=encoding) as csvfile:
    count = 0
    filecount = 0
    datareader = csv.reader(csvfile)
    header = next(datareader)
    file_number = 0  # stays 0 when the input has fewer rows than chunk_size
    print("creating files")
    for row in datareader:
        count += 1
        rows.append(row)
        if count % chunk_size == 0:
            file_number = count // chunk_size
            print(f"creating {file_number}")
            write_chunk(header, file_number, rows)
            rows = []
            filecount += 1
    # Flush any leftover rows into one final, smaller chunk.
    if len(rows) > 0:
        print("creating last file")
        write_chunk(header, file_number + 1, rows)
        filecount += 1
print(f"{filecount} files created from {filename}")
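
A quick sanity check is to confirm that the chunk files together contain every data row of the original. A minimal sketch, assuming the part/ layout and naming convention produced by the script above (the iso-8859-1 encoding here is an example; use whatever the script reported):

from pathlib import Path
import csv

total = 0
for chunk in sorted(Path("part").glob("*_part_*.csv")):
    with chunk.open(encoding="iso-8859-1", newline="") as f:
        # Count logical rows, subtracting the header repeated in every chunk.
        total += sum(1 for _ in csv.reader(f)) - 1
print(f"{total} data rows across all chunks")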