
@jacobwegner
Created July 9, 2025 15:34
Script to split JSONL files by size
import sys
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

MAX_SIZE = 25 * 1024 * 1024  # 25 MB


def split_jsonl_by_size(file_path: Path):
    """Split one .jsonl file into sibling parts no larger than MAX_SIZE each."""
    base = file_path.stem
    ext = file_path.suffix
    dir_ = file_path.parent
    part_num = 1
    current_size = 0
    current_lines = []

    def write_chunk():
        nonlocal part_num, current_lines
        # Parts get a zero-padded suffix next to the original, e.g. data-001.jsonl
        chunk_path = dir_ / f"{base}-{part_num:03d}{ext}"
        with chunk_path.open('w', encoding='utf-8') as f:
            f.writelines(current_lines)
        part_num += 1
        current_lines = []

    try:
        with file_path.open('r', encoding='utf-8') as infile:
            for line in infile:
                # Measure the encoded size so multibyte characters count correctly.
                line_size = len(line.encode('utf-8'))
                # Flush the current chunk before this line would push it past the limit.
                if current_size + line_size > MAX_SIZE and current_lines:
                    write_chunk()
                    current_size = 0
                current_lines.append(line)
                current_size += line_size
        if current_lines:
            write_chunk()
        file_path.unlink()  # Delete the original file once all parts are written
        print(f"✅ Finished: {file_path}")
    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")


def process_directory_parallel(root_dir: Path, max_workers=4):
    """Find every .jsonl file under root_dir and split them concurrently."""
    jsonl_files = list(root_dir.rglob("*.jsonl"))
    if not jsonl_files:
        print("No .jsonl files found.")
        return
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        executor.map(split_jsonl_by_size, jsonl_files)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python split_jsonl_parallel.py <directory>")
        sys.exit(1)
    root_directory = Path(sys.argv[1])
    if not root_directory.is_dir():
        print(f"Error: {root_directory} is not a directory")
        sys.exit(1)
    process_directory_parallel(root_directory)
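
A typical invocation, assuming the script is saved as split_jsonl_parallel.py (the name its own usage message suggests) and a hypothetical data/ directory:

    python split_jsonl_parallel.py ./data

Each oversized file such as data/events.jsonl is replaced by parts like data/events-001.jsonl, data/events-002.jsonl, and so on, following the chunk-naming pattern in write_chunk.

For a post-run sanity check, here is a minimal sketch (not part of the gist) that confirms every remaining part is at or under the limit. It assumes the same MAX_SIZE and the hypothetical data/ directory; since the splitter never breaks a line, a single JSONL line larger than 25 MB is the one case that can still produce an oversized part.

    from pathlib import Path

    MAX_SIZE = 25 * 1024 * 1024  # must match the splitter's limit

    # Originals are deleted after splitting, so every remaining .jsonl is a part.
    for part in Path("data").rglob("*.jsonl"):
        size = part.stat().st_size
        status = "OK" if size <= MAX_SIZE else "OVERSIZED (single line > limit?)"
        print(f"{part}: {size} bytes {status}")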