Created
September 7, 2025 22:05
-
-
Save yeiichi/3eb8743cdc071a6316769ab4638465f1 to your computer and use it in GitHub Desktop.
Classify CSV files by strict header match and move them to mapped destinations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
A tool for classifying and moving CSV files based on column headers. | |
This script processes CSV files by matching their headers against predefined | |
layouts. Depending on the matched layout, the files are moved to their | |
corresponding destination directories. The script supports both file-based | |
and directory-based classification and offers options for dry-run execution, | |
recursion, and verbose output. | |
It uses pandas for reading CSV files and argparse for command-line parsing. | |
""" | |
from __future__ import annotations | |
import argparse | |
from pathlib import Path | |
from typing import Dict, Iterable, Tuple | |
import sys | |
import pandas as pd | |
# --- Configuration ------------------------------------------------------------
# Known header layouts. Matching is strict: same column names in the same order.
COL_TYPES: list[list[str]] = [
    ["id", "name", "value"],
    ["timestamp", "user", "event"],
    ["a", "b", "c"],
]

# Directories that classified files are moved into.
DST_A = Path("/path/to/A")
DST_B = Path("/path/to/B")
DST_C = Path("/path/to/C")

# Layout (as a tuple, so it can be a dict key) -> destination directory.
# Layouts and destinations are paired by position.
COL_TO_DST: Dict[Tuple[str, ...], Path] = {
    tuple(layout): destination
    for layout, destination in zip(COL_TYPES, (DST_A, DST_B, DST_C))
}
# --- Helpers ----------------------------------------------------------------- | |
def normalize_cols(df: pd.DataFrame) -> Tuple[str, ...]:
    """Return the DataFrame's column labels, in order, as a tuple."""
    labels = df.columns.tolist()
    return tuple(labels)
def ensure_dir(p: Path) -> None:
    """Create directory `p` with any missing parents; an existing directory is fine."""
    p.mkdir(exist_ok=True, parents=True)
def safe_move(src: Path, dst_dir: Path) -> Path:
    """
    Move `src` into `dst_dir` without overwriting an existing entry.

    `dst_dir` is created (with parents) when missing. If it already contains
    an entry with the same name as `src`, a numeric suffix is inserted before
    the extension (`name.1.csv`, `name.2.csv`, ...) until an unused name is
    found.

    Args:
        src: File to move.
        dst_dir: Directory to move it into.

    Returns:
        The final path of the moved file.
    """
    # Function-scope import so the block does not rely on a new file-level import.
    import shutil

    dst_dir.mkdir(parents=True, exist_ok=True)

    stem, suffix = src.stem, src.suffix
    candidate = dst_dir / src.name
    attempt = 0
    while candidate.exists():
        attempt += 1
        candidate = dst_dir / f"{stem}.{attempt}{suffix}"

    # Path.rename raises OSError when source and destination live on different
    # filesystems; shutil.move falls back to copy-then-delete in that case.
    shutil.move(str(src), str(candidate))
    return candidate
# --- Core -------------------------------------------------------------------- | |
class UnrecognizedColumnsError(ValueError):
    """Raised when a CSV header matches none of the known column layouts."""
def classify_csv(csv_path: Path, *, dry_run: bool = False, verbose: bool = False, **read_csv_kwargs) -> Path:
    """
    Read a CSV header, strictly match it against the known layouts, and move
    the file into the mapped destination directory.

    Args:
        csv_path: CSV file to classify.
        dry_run: When true, do not move anything; only compute the target.
        verbose: When true, print one line describing the (intended) move.
        **read_csv_kwargs: Extra keyword arguments forwarded to
            `pandas.read_csv` (e.g. `encoding`, `sep`).

    Returns:
        The final path of the moved file. With `dry_run`, the intended path
        `<destination>/<file name>`, which does not account for the numeric
        suffix a real move would add on a name collision.

    Raises:
        FileNotFoundError: `csv_path` does not exist.
        UnrecognizedColumnsError: The header matches no known layout.
    """
    if not csv_path.exists():
        raise FileNotFoundError(csv_path)

    # Only the header decides the destination, so skip parsing the data rows
    # unless the caller explicitly asked for some via `nrows`.
    read_csv_kwargs.setdefault("nrows", 0)
    df = pd.read_csv(csv_path, **read_csv_kwargs)
    cols = normalize_cols(df)

    try:
        dst_dir = COL_TO_DST[cols]
    except KeyError:
        # The dict lookup is an implementation detail; keep it out of the traceback.
        raise UnrecognizedColumnsError(f"Unrecognized columns: {cols}") from None

    if dry_run:
        intended = dst_dir / csv_path.name
        if verbose:
            print(f"[DRY-RUN] {csv_path} -> {intended}")
        return intended

    moved = safe_move(csv_path, dst_dir)
    if verbose:
        print(f"[MOVED] {csv_path} -> {moved}")
    return moved
def classify_many(paths: Iterable[Path], *, dry_run: bool = False, verbose: bool = False, **read_csv_kwargs) -> dict[Path, Path]:
    """
    Classify every path in `paths`, in iteration order.

    Each path is handed to `classify_csv` with the same options. Returns a
    mapping from each source path to the path `classify_csv` returned for it.
    Any exception raised by `classify_csv` propagates and stops the batch.
    """
    return {
        source: classify_csv(source, dry_run=dry_run, verbose=verbose, **read_csv_kwargs)
        for source in paths
    }
# --- CLI --------------------------------------------------------------------- | |
def parse_args(argv: list[str]) -> argparse.Namespace:
    """
    Parse the command-line arguments in `argv` (program name excluded).

    Exactly one input source is required: `--csv` (one or more files) or
    `--dir` (a directory scanned with `--glob`, optionally `--recursive`).
    The remaining options control dry-run mode, verbosity, and the encoding
    and delimiter handed to pandas.
    """
    parser = argparse.ArgumentParser(
        description="Classify CSV files by strict header match and move them to mapped destinations."
    )

    # Input source: explicit files or a directory scan, never both.
    source_group = parser.add_mutually_exclusive_group(required=True)
    source_group.add_argument(
        "--csv",
        type=Path,
        nargs="+",
        help="One or more CSV file paths to classify.",
    )
    source_group.add_argument(
        "--dir",
        type=Path,
        help="Directory to scan for CSV files (use with --glob).",
    )

    parser.add_argument(
        "--glob",
        default="*.csv",
        help="Glob pattern when using --dir (default: *.csv).",
    )
    parser.add_argument(
        "--recursive",
        action="store_true",
        help="Recurse into subdirectories when using --dir.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show intended moves without renaming files.",
    )
    parser.add_argument(
        "--encoding",
        default=None,
        help="Encoding passed to pandas.read_csv (e.g., utf-8, utf-8-sig).",
    )
    parser.add_argument(
        "--delimiter",
        "--sep",
        dest="sep",
        default=None,
        help="Field delimiter passed to pandas.read_csv.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Verbose output.",
    )

    namespace = parser.parse_args(argv)
    return namespace
def gather_paths(args: argparse.Namespace) -> list[Path]:
    """
    Turn the parsed CLI arguments into the list of files to classify.

    With `args.csv`, the given paths are kept in command-line order; any path
    that is not an existing regular file is skipped with a warning on stderr.
    With `args.dir`, the directory is scanned using `args.glob` (recursively
    when `args.recursive` is set) and the matching files are returned sorted.

    Returns:
        The selected files; an empty list when no source is set or nothing
        matches.
    """
    if args.csv:
        selected: list[Path] = []
        for path in args.csv:
            if path.is_file():
                selected.append(path)
            else:
                # Report instead of dropping silently, so a mistyped path is noticed.
                print(f"Warning: skipping {path}: not an existing file", file=sys.stderr)
        return selected

    if args.dir:
        scan = args.dir.rglob if args.recursive else args.dir.glob
        # Glob order depends on the filesystem; sort so runs are reproducible.
        return sorted(p for p in scan(args.glob) if p.is_file())

    return []
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point.

    Args:
        argv: Arguments without the program name. `sys.argv[1:]` is used only
            when this is None, so an explicit empty list is parsed as given.

    Returns:
        Process exit status:
        0 - all files classified;
        2 - no input files found;
        3 - a file's header matched no known layout;
        4 - a file did not exist;
        5 - pandas could not parse a file (including an empty file).
    """
    # `argv or sys.argv[1:]` would silently swap an explicit [] for the real
    # process arguments, so test for None instead.
    args = parse_args(sys.argv[1:] if argv is None else argv)

    files = gather_paths(args)
    if not files:
        print("No CSV files found.", file=sys.stderr)
        return 2

    # Forward only the pandas.read_csv options the user actually set.
    read_kwargs = {}
    if args.encoding:
        read_kwargs["encoding"] = args.encoding
    if args.sep:
        read_kwargs["sep"] = args.sep

    try:
        classify_many(files, dry_run=args.dry_run, verbose=args.verbose, **read_kwargs)
    except UnrecognizedColumnsError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 4
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        # EmptyDataError (empty file) is not a ParserError subclass, so it has
        # to be listed explicitly to avoid an uncaught traceback.
        print(f"CSV parse error: {e}", file=sys.stderr)
        return 5
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment