Skip to content

Instantly share code, notes, and snippets.

@yeiichi
Created September 7, 2025 22:05
Show Gist options
  • Save yeiichi/3eb8743cdc071a6316769ab4638465f1 to your computer and use it in GitHub Desktop.
Save yeiichi/3eb8743cdc071a6316769ab4638465f1 to your computer and use it in GitHub Desktop.
Classify CSV files by strict header match and move them to mapped destinations
#!/usr/bin/env python3
"""
A tool for classifying and moving CSV files based on column headers.
This script processes CSV files by matching their headers against predefined
layouts. Depending on the matched layout, the files are moved to their
corresponding destination directories. The script supports both file-based
and directory-based classification and offers options for dry-run execution,
recursion, and verbose output.
It uses pandas for reading CSV files and argparse for command-line parsing.
"""
from __future__ import annotations
import argparse
from pathlib import Path
from typing import Dict, Iterable, Tuple
import sys
import pandas as pd
# --- Configuration ------------------------------------------------------------
# Example canonical header layouts. Matching is strict: the same column names
# in the same order.
COL_TYPES: list[list[str]] = [
    ["id", "name", "value"],
    ["timestamp", "user", "event"],
    ["a", "b", "c"],
]

# Destination directories, one per layout above.
DST_A = Path("/path/to/A")
DST_B = Path("/path/to/B")
DST_C = Path("/path/to/C")

# Lookup table from a header (as a tuple, so it is hashable) to the directory
# that files with that header are moved into. Layouts and destinations are
# paired positionally.
COL_TO_DST: Dict[Tuple[str, ...], Path] = dict(
    zip(map(tuple, COL_TYPES), (DST_A, DST_B, DST_C))
)
# --- Helpers -----------------------------------------------------------------
def normalize_cols(df: pd.DataFrame) -> Tuple[str, ...]:
    """Return the DataFrame's column labels as a tuple, in their existing order.

    The labels are passed through untouched (no stripping or case folding);
    the only conversion is from the pandas Index to a plain tuple.
    """
    labels = df.columns.tolist()
    return tuple(labels)
def ensure_dir(p: Path) -> None:
    """Create directory `p` along with any missing parents.

    An already existing directory is not an error.
    """
    p.mkdir(exist_ok=True, parents=True)
def safe_move(src: Path, dst_dir: Path) -> Path:
    """
    Move `src` into `dst_dir` without overwriting an existing file.

    `dst_dir` is created (with parents) if it is missing. When a file named
    like `src` already exists there, a numeric suffix is inserted before the
    extension (`name.1.ext`, `name.2.ext`, ...) until an unused name is found.
    If `src` already is the file at `dst_dir / src.name`, nothing is moved and
    that path is returned as-is.

    Returns the final path of the file.
    Raises FileNotFoundError if `src` does not exist.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    dst_dir.mkdir(parents=True, exist_ok=True)
    target = dst_dir / src.name

    # A file that already sits in its destination must not be treated as a
    # name collision with itself (that would rename it to `name.1.ext`).
    if target.exists() and target.resolve() == src.resolve():
        return target

    stem, suffix = target.stem, target.suffix
    candidate = target
    counter = 0
    while candidate.exists():
        counter += 1
        candidate = target.with_name(f"{stem}.{counter}{suffix}")

    # shutil.move falls back to copy + delete when source and destination are
    # on different filesystems, where a plain rename raises OSError.
    shutil.move(str(src), str(candidate))
    return candidate
# --- Core --------------------------------------------------------------------
class UnrecognizedColumnsError(ValueError):
    """Raised when a CSV header does not match any known column layout."""
def classify_csv(
    csv_path: Path,
    *,
    dry_run: bool = False,
    verbose: bool = False,
    **read_csv_kwargs,
) -> Path:
    """
    Match a CSV's header against the known layouts and move the file into
    the destination directory mapped to that layout.

    Args:
        csv_path: CSV file to classify.
        dry_run: If true, do not move anything; only compute the destination.
        verbose: If true, print one line describing the (intended) move.
        **read_csv_kwargs: Extra keyword arguments forwarded to
            `pandas.read_csv` (e.g. `encoding`, `sep`).

    Returns:
        The final path of the moved file. With `dry_run`, the intended path
        `dst_dir / csv_path.name`, which does not account for a collision
        suffix that a real move might add.

    Raises:
        FileNotFoundError: `csv_path` does not exist.
        UnrecognizedColumnsError: the header matches no known layout.
    """
    if not csv_path.exists():
        raise FileNotFoundError(csv_path)

    # Only the header is needed for classification, so skip parsing the data
    # rows. A caller-supplied `nrows` still takes precedence.
    header_kwargs = {"nrows": 0, **read_csv_kwargs}
    df = pd.read_csv(csv_path, **header_kwargs)
    cols = normalize_cols(df)

    # Plain lookup instead of try/except so the raised error is not chained
    # to an internal KeyError.
    dst_dir = COL_TO_DST.get(cols)
    if dst_dir is None:
        raise UnrecognizedColumnsError(f"Unrecognized columns: {cols}")

    if dry_run:
        intended = dst_dir / csv_path.name
        if verbose:
            print(f"[DRY-RUN] {csv_path} -> {intended}")
        return intended

    moved = safe_move(csv_path, dst_dir)
    if verbose:
        print(f"[MOVED] {csv_path} -> {moved}")
    return moved
def classify_many(
    paths: Iterable[Path],
    *,
    dry_run: bool = False,
    verbose: bool = False,
    **read_csv_kwargs,
) -> dict[Path, Path]:
    """
    Classify each path in `paths` with `classify_csv`, in iteration order.

    All keyword arguments are forwarded unchanged. Returns a mapping from each
    input path to the path returned for it. Any exception raised for one file
    propagates immediately, so later files are not processed.
    """
    return {
        path: classify_csv(path, dry_run=dry_run, verbose=verbose, **read_csv_kwargs)
        for path in paths
    }
# --- CLI ---------------------------------------------------------------------
def parse_args(argv: list[str]) -> argparse.Namespace:
    """
    Parse command-line arguments for the classifier.

    Exactly one input source is required: `--csv` (one or more files) or
    `--dir` (a directory to scan). The remaining options are shared.
    """
    parser = argparse.ArgumentParser(
        description="Classify CSV files by strict header match and move them to mapped destinations."
    )

    # Input source: the two options are mutually exclusive, one is mandatory.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--csv",
        nargs="+",
        type=Path,
        help="One or more CSV file paths to classify.",
    )
    input_group.add_argument(
        "--dir",
        type=Path,
        help="Directory to scan for CSV files (use with --glob).",
    )

    # Shared options, registered in the order they appear in --help.
    shared_options = (
        (("--glob",), {
            "default": "*.csv",
            "help": "Glob pattern when using --dir (default: *.csv).",
        }),
        (("--recursive",), {
            "action": "store_true",
            "help": "Recurse into subdirectories when using --dir.",
        }),
        (("--dry-run",), {
            "action": "store_true",
            "help": "Show intended moves without renaming files.",
        }),
        (("--encoding",), {
            "default": None,
            "help": "Encoding passed to pandas.read_csv (e.g., utf-8, utf-8-sig).",
        }),
        (("--delimiter", "--sep"), {
            "dest": "sep",
            "default": None,
            "help": "Field delimiter passed to pandas.read_csv.",
        }),
        (("--verbose", "-v"), {
            "action": "store_true",
            "help": "Verbose output.",
        }),
    )
    for flags, settings in shared_options:
        parser.add_argument(*flags, **settings)

    return parser.parse_args(argv)
def gather_paths(args: argparse.Namespace) -> list[Path]:
    """
    Build the list of files to classify from parsed CLI arguments.

    With `args.csv`, the given paths are returned in the order supplied,
    keeping only those that are existing regular files. With `args.dir`, the
    directory is scanned with `args.glob` (recursively when `args.recursive`
    is set) and the matching files are returned sorted. Returns an empty list
    when neither source is set.
    """
    if args.csv:
        return [p for p in args.csv if p.is_file()]
    if args.dir:
        scan = args.dir.rglob if args.recursive else args.dir.glob
        # Glob order depends on the filesystem. Sorting makes the processing
        # order reproducible between runs.
        return sorted(p for p in scan(args.glob) if p.is_file())
    return []
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point.

    Args:
        argv: Argument list without the program name. `None` means "use
            `sys.argv[1:]`"; an explicit list (even an empty one) is parsed
            as given.

    Returns:
        Process exit code: 0 on success, 2 when no input files were found,
        3 for an unrecognized header, 4 for a missing file, 5 when pandas
        could not parse a file (including an empty file).
    """
    # `argv or sys.argv[1:]` would silently replace an explicit empty list
    # with the real command line, so test for None instead.
    args = parse_args(sys.argv[1:] if argv is None else argv)

    files = gather_paths(args)
    if not files:
        print("No CSV files found.", file=sys.stderr)
        return 2

    # Forward only the pandas.read_csv options the user actually set.
    read_kwargs = {}
    if args.encoding:
        read_kwargs["encoding"] = args.encoding
    if args.sep:
        read_kwargs["sep"] = args.sep

    try:
        classify_many(files, dry_run=args.dry_run, verbose=args.verbose, **read_kwargs)
    except UnrecognizedColumnsError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 4
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        # EmptyDataError is raised for a file with no header at all; report it
        # like any other unparsable CSV instead of crashing with a traceback.
        print(f"CSV parse error: {e}", file=sys.stderr)
        return 5
    return 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment