Skip to content

Instantly share code, notes, and snippets.

@KoStard
Created July 29, 2022 10:07
Show Gist options
  • Save KoStard/72283408c8ff239de49da144f6ec0f7d to your computer and use it in GitHub Desktop.
Save KoStard/72283408c8ff239de49da144f6ec0f7d to your computer and use it in GitHub Desktop.
Split CSV files into smaller files - configure the delimiter, header handling, number of lines in each file, etc.
from os import mkdir
import pandas as pd
import sys
import argparse
import pathlib
def _parse_bool(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` treats every non-empty string as True,
    so ``--header False`` would silently keep the header enabled. This
    converter recognises the usual spellings instead.

    Args:
        value: Raw string taken from the command line.

    Returns:
        True or False according to the spelling of *value*.

    Raises:
        argparse.ArgumentTypeError: If *value* is not a recognised boolean.
    """
    normalized = value.strip().lower()
    if normalized in {"1", "true", "t", "yes", "y", "on"}:
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if normalized in {"", "0", "false", "f", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value (true/false), got {value!r}")


def _build_parser():
    """Create the command-line parser for the CSV splitter."""
    parser = argparse.ArgumentParser(description="Split CSV documents")
    parser.add_argument('input_filepath',
                        help="The input csv file you want to split")
    parser.add_argument(
        'target_folder_path',
        help="The output folder where the splitted files will be put")
    parser.add_argument('-d',
                        '--delimiter',
                        default=',',
                        help="The delimiter used inside the CSV file")
    parser.add_argument('-hr',
                        '--header',
                        type=_parse_bool,
                        default=True,
                        help="Should treat the first row as header")
    parser.add_argument('-l',
                        '--lines',
                        type=int,
                        default=10000,
                        help="Number of lines in each file")
    return parser


def split_csv(input_filepath,
              target_folder_path,
              delimiter=',',
              header=True,
              lines=10000):
    """Split one CSV file into several files of at most *lines* rows each.

    Each output file is named ``<input stem>_<start>_to_<start + lines>.csv``
    and is written with the same delimiter as the input. When *header* is
    true the header row is repeated at the top of every output file.

    Args:
        input_filepath: Path of the CSV file to split.
        target_folder_path: Folder that receives the output files; it is
            created (including missing parents) when it does not exist.
        delimiter: Field delimiter used for both reading and writing.
        header: Whether the first row of the input is a header row.
        lines: Maximum number of data rows per output file.

    Returns:
        A list with the paths of the files written, in order.

    Raises:
        ValueError: If *lines* is smaller than 1.
        FileExistsError: If *target_folder_path* exists but is not a folder.
    """
    if lines < 1:
        raise ValueError("lines must be a positive integer")

    df = pd.read_csv(input_filepath,
                     delimiter=delimiter,
                     header=0 if header else None)

    output_folder = pathlib.Path(target_folder_path)
    try:
        output_folder.mkdir(parents=True)
    except FileExistsError:
        # An existing folder is fine; an existing regular file is not.
        if not output_folder.is_dir():
            raise
        print("The output folder already exists, continuing...")

    input_stem = pathlib.Path(input_filepath).stem
    written = []
    for start in range(0, len(df), lines):
        # iloc is positional and end-exclusive, so each chunk holds exactly
        # `lines` rows. Label-based loc is end-inclusive and would repeat
        # the boundary row in the next file.
        sheet = df.iloc[start:start + lines]
        target = output_folder / f'{input_stem}_{start}_to_{start + lines}.csv'
        # index=False: the input has no index column, so the output must not
        # gain one either.
        sheet.to_csv(target, header=header, sep=delimiter, index=False)
        written.append(target)
    return written


def main(argv=None):
    """Parse command-line arguments and run the split.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    if args.lines < 1:
        parser.error("--lines must be a positive integer")
    split_csv(args.input_filepath,
              args.target_folder_path,
              delimiter=args.delimiter,
              header=args.header,
              lines=args.lines)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment