
@jacobwegner
Created July 9, 2025 15:34
Script to split JSONL files by size
import sys
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

MAX_SIZE = 25 * 1024 * 1024  # 25 MB


def split_jsonl_by_size(file_path: Path):
    """Split one .jsonl file into sibling parts no larger than MAX_SIZE each."""
    base = file_path.stem
    ext = file_path.suffix
    dir_ = file_path.parent
    part_num = 1
    current_size = 0
    current_lines = []

    def write_chunk():
        nonlocal part_num, current_lines
        # Parts get a zero-padded suffix next to the original, e.g. data-001.jsonl
        chunk_path = dir_ / f"{base}-{part_num:03d}{ext}"
        with chunk_path.open('w', encoding='utf-8') as f:
            f.writelines(current_lines)
        part_num += 1
        current_lines = []

    try:
        with file_path.open('r', encoding='utf-8') as infile:
            for line in infile:
                # Measure the encoded size so multibyte characters count correctly.
                line_size = len(line.encode('utf-8'))
                # Flush the current chunk before this line would push it past the limit.
                if current_size + line_size > MAX_SIZE and current_lines:
                    write_chunk()
                    current_size = 0
                current_lines.append(line)
                current_size += line_size
        if current_lines:
            write_chunk()
        file_path.unlink()  # Delete the original file once all parts are written
        print(f"✅ Finished: {file_path}")
    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")


def process_directory_parallel(root_dir: Path, max_workers=4):
    """Find every .jsonl file under root_dir and split them concurrently."""
    jsonl_files = list(root_dir.rglob("*.jsonl"))
    if not jsonl_files:
        print("No .jsonl files found.")
        return
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        executor.map(split_jsonl_by_size, jsonl_files)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python split_jsonl_parallel.py <directory>")
        sys.exit(1)
    root_directory = Path(sys.argv[1])
    if not root_directory.is_dir():
        print(f"Error: {root_directory} is not a directory")
        sys.exit(1)
    process_directory_parallel(root_directory)
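
A typical invocation, assuming the script is saved as split_jsonl_parallel.py (the name its own usage message suggests) and a hypothetical data/ directory:

    python split_jsonl_parallel.py ./data

Each oversized file such as data/events.jsonl is replaced by parts like data/events-001.jsonl, data/events-002.jsonl, and so on, following the chunk-naming pattern in write_chunk.

For a post-run sanity check, here is a minimal sketch (not part of the gist) that confirms every remaining part is at or under the limit. It assumes the same MAX_SIZE and the hypothetical data/ directory; since the splitter never breaks a line, a single JSONL line larger than 25 MB is the one case that can still produce an oversized part.

    from pathlib import Path

    MAX_SIZE = 25 * 1024 * 1024  # must match the splitter's limit

    # Originals are deleted after splitting, so every remaining .jsonl is a part.
    for part in Path("data").rglob("*.jsonl"):
        size = part.stat().st_size
        status = "OK" if size <= MAX_SIZE else "OVERSIZED (single line > limit?)"
        print(f"{part}: {size} bytes {status}")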