yeiichi · September 7, 2025 22:05
diff --git a/csv_classifier.py b/csv_classifier.py
 #!/usr/bin/env python3
 """
 A tool for classifying and moving CSV files based on column headers.

 This script processes CSV files by matching their headers against predefined
 layouts. Depending on the matched layout, the files are moved to their
 corresponding destination directories. The script supports both file-based
 and directory-based classification and offers options for dry-run execution,
 recursion, and verbose output.

 It uses pandas for reading CSV files and argparse for command-line parsing.
 """
 from __future__ import annotations

 import argparse
 from pathlib import Path
 from typing import Dict, Iterable, Tuple
 import sys
 import pandas as pd

 # --- Configuration ------------------------------------------------------------

 # Example canonical column layouts. A header matches only when both the
 # column names and their order are identical to one of these entries.
 COL_TYPES: list[list[str]] = [
    ["id", "name", "value"],
    ["timestamp", "user", "event"],
    ["a", "b", "c"],
 ]

 # Destination directory for each layout, listed in the same order as COL_TYPES.
 DST_A = Path("/path/to/A")
 DST_B = Path("/path/to/B")
 DST_C = Path("/path/to/C")

 # Exact header tuple -> destination directory, paired positionally.
 COL_TO_DST: Dict[Tuple[str, ...], Path] = dict(
    zip(map(tuple, COL_TYPES), (DST_A, DST_B, DST_C))
 )

 # --- Helpers -----------------------------------------------------------------

 def normalize_cols(df: pd.DataFrame) -> Tuple[str, ...]:
    """Return the DataFrame's column labels, in order, as a hashable tuple."""
    labels = df.columns.tolist()
    return tuple(labels)

 def ensure_dir(p: Path) -> None:
    """Create directory `p` along with any missing parents; an existing one is left as is."""
    p.mkdir(exist_ok=True, parents=True)

 def safe_move(src: Path, dst_dir: Path) -> Path:
    """
    Move `src` into `dst_dir` without overwriting an existing entry.

    `dst_dir` is created (with parents) when it is missing. If it already
    holds an entry with the same name as `src`, a numeric suffix is inserted
    before the extension (`name.1.csv`, `name.2.csv`, ...) until a free name
    is found.

    The move goes through `shutil.move`, so it also works when `src` and
    `dst_dir` are on different filesystems, where a plain rename fails.

    Returns: the path the file now lives at.
    """
    import shutil  # local import: the rest of the module does not need it

    dst_dir.mkdir(parents=True, exist_ok=True)
    target = dst_dir / src.name
    # Suffixes are always derived from the original name, not the last candidate.
    stem, suffix = target.stem, target.suffix
    counter = 0
    while target.exists():
        counter += 1
        target = dst_dir / f"{stem}.{counter}{suffix}"

    # NOTE: the existence check and the move are not atomic; a file created
    # at `target` by another process in between is not detected here.
    shutil.move(str(src), str(target))
    return target

 # --- Core --------------------------------------------------------------------

 class UnrecognizedColumnsError(ValueError):
    """Raised when a CSV header matches none of the configured layouts."""

 def classify_csv(csv_path: Path, *, dry_run: bool = False, verbose: bool = False, **read_csv_kwargs) -> Path:
    """
    Classify one CSV by its header and move it to the matching directory.

    Only the header row is parsed (`nrows=0` unless the caller passes its own
    `nrows`), so a large file is not loaded into memory just to be routed.
    The header must equal one of the `COL_TO_DST` keys exactly: same names,
    same order.

    Args:
        csv_path: File to classify.
        dry_run: When true, compute the destination but leave the file alone.
        verbose: When true, print one line describing the (intended) move.
        **read_csv_kwargs: Extra keyword arguments forwarded to
            `pandas.read_csv` (e.g. `encoding`, `sep`).

    Returns: final (or intended, if dry_run) destination path. In a dry run
    the path is `dst_dir / csv_path.name`; it does not include any numeric
    suffix a real move would add to avoid a name collision.

    Raises:
        FileNotFoundError: `csv_path` does not exist.
        UnrecognizedColumnsError: the header matches no known layout.
    """
    if not csv_path.exists():
        raise FileNotFoundError(csv_path)

    # Caller-supplied kwargs come last, so an explicit `nrows` still wins.
    read_options = {"nrows": 0, **read_csv_kwargs}
    header = pd.read_csv(csv_path, **read_options)
    cols = normalize_cols(header)

    # A plain lookup avoids chaining an internal KeyError onto the domain error.
    dst_dir = COL_TO_DST.get(cols)
    if dst_dir is None:
        raise UnrecognizedColumnsError(f"Unrecognized columns: {cols}")

    if dry_run:
        intended = dst_dir / csv_path.name
        if verbose:
            print(f"[DRY-RUN] {csv_path} -> {intended}")
        return intended

    moved = safe_move(csv_path, dst_dir)
    if verbose:
        print(f"[MOVED]   {csv_path} -> {moved}")
    return moved

 def classify_many(paths: Iterable[Path], *, dry_run: bool = False, verbose: bool = False, **read_csv_kwargs) -> dict[Path, Path]:
    """
    Run `classify_csv` on every path, in iteration order.

    Returns: mapping of each source path to the path `classify_csv` returned
    for it. Any exception raised for one file propagates immediately.
    """
    return {
        source: classify_csv(source, dry_run=dry_run, verbose=verbose, **read_csv_kwargs)
        for source in paths
    }

 # --- CLI ---------------------------------------------------------------------

 def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command-line arguments in `argv` (program name excluded)."""
    parser = argparse.ArgumentParser(
        description="Classify CSV files by strict header match and move them to mapped destinations."
    )

    # Exactly one input source is required: explicit files or a directory.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument(
        "--csv",
        type=Path,
        nargs="+",
        help="One or more CSV file paths to classify.",
    )
    source.add_argument(
        "--dir",
        type=Path,
        help="Directory to scan for CSV files (use with --glob).",
    )

    parser.add_argument(
        "--glob",
        default="*.csv",
        help="Glob pattern when using --dir (default: *.csv).",
    )
    parser.add_argument(
        "--recursive",
        action="store_true",
        help="Recurse into subdirectories when using --dir.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show intended moves without renaming files.",
    )
    parser.add_argument(
        "--encoding",
        default=None,
        help="Encoding passed to pandas.read_csv (e.g., utf-8, utf-8-sig).",
    )
    parser.add_argument(
        "--delimiter",
        "--sep",
        dest="sep",
        default=None,
        help="Field delimiter passed to pandas.read_csv.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Verbose output.",
    )
    return parser.parse_args(argv)

 def gather_paths(args: argparse.Namespace) -> list[Path]:
    """
    Turn the parsed CLI arguments into the list of files to classify.

    With `--csv`, the given paths are kept in order. Entries that are not
    existing regular files are left out, and each one is reported on stderr
    so a mistyped path does not vanish silently. With `--dir`, the directory
    is searched with `args.glob` (recursively when `args.recursive` is set)
    and only regular files are returned.

    Returns: list of file paths; empty when nothing qualifies.
    """
    if args.csv:
        found: list[Path] = []
        for candidate in args.csv:
            if candidate.is_file():
                found.append(candidate)
            else:
                print(f"Skipping (not a file): {candidate}", file=sys.stderr)
        return found
    if args.dir:
        search = args.dir.rglob if args.recursive else args.dir.glob
        return [match for match in search(args.glob) if match.is_file()]
    return []

 def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point.

    Args:
        argv: Argument list without the program name. `None` means
            `sys.argv[1:]`; an explicit list (even an empty one) is parsed
            exactly as given.

    Returns: process exit code —
        0 success, 2 no input files, 3 unrecognized header,
        4 missing file, 5 unparseable or empty CSV.
    """
    # `argv or sys.argv[1:]` would swap an explicit [] for the real command
    # line; fall back only when no list was supplied at all.
    args = parse_args(sys.argv[1:] if argv is None else argv)
    files = gather_paths(args)
    if not files:
        print("No CSV files found.", file=sys.stderr)
        return 2

    # pandas.read_csv kwargs: forward only the options the user actually set.
    read_kwargs = {}
    if args.encoding:
        read_kwargs["encoding"] = args.encoding
    if args.sep:
        read_kwargs["sep"] = args.sep

    try:
        classify_many(files, dry_run=args.dry_run, verbose=args.verbose, **read_kwargs)
    except UnrecognizedColumnsError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 4
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        # EmptyDataError (a file with no header at all) is not a ParserError
        # subclass, so it has to be listed explicitly.
        print(f"CSV parse error: {e}", file=sys.stderr)
        return 5
    return 0

 if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    exit_code = main()
    raise SystemExit(exit_code)
	#!/usr/bin/env python3
	"""
	A tool for classifying and moving CSV files based on column headers.

	This script processes CSV files by matching their headers against predefined
	layouts. Depending on the matched layout, the files are moved to their
	corresponding destination directories. The script supports both file-based
	and directory-based classification and offers options for dry-run execution,
	recursion, and verbose output.

	It uses pandas for reading CSV files and argparse for command-line parsing.
	"""
	from __future__ import annotations

	import argparse
	from pathlib import Path
	from typing import Dict, Iterable, Tuple
	import sys
	import pandas as pd

	# --- Configuration ------------------------------------------------------------

	# Example canonical column layouts. A header matches only when both the
	# column names and their order are identical to one of these entries.
	COL_TYPES: list[list[str]] = [
	    ["id", "name", "value"],
	    ["timestamp", "user", "event"],
	    ["a", "b", "c"],
	]

	# Destination directory for each layout, listed in the same order as COL_TYPES.
	DST_A = Path("/path/to/A")
	DST_B = Path("/path/to/B")
	DST_C = Path("/path/to/C")

	# Exact header tuple -> destination directory, paired positionally.
	COL_TO_DST: Dict[Tuple[str, ...], Path] = dict(
	    zip(map(tuple, COL_TYPES), (DST_A, DST_B, DST_C))
	)

	# --- Helpers -----------------------------------------------------------------

	def normalize_cols(df: pd.DataFrame) -> Tuple[str, ...]:
	    """Return the DataFrame's column labels, in order, as a hashable tuple."""
	    return tuple(df.columns.tolist())

	def ensure_dir(p: Path) -> None:
	    """Create directory `p` along with any missing parents; an existing one is left as is."""
	    p.mkdir(parents=True, exist_ok=True)

	def safe_move(src: Path, dst_dir: Path) -> Path:
	    """
	    Move `src` into `dst_dir` without overwriting an existing entry.

	    `dst_dir` is created (with parents) when it is missing. If it already
	    holds an entry with the same name as `src`, a numeric suffix is inserted
	    before the extension (`name.1.csv`, `name.2.csv`, ...) until a free name
	    is found.

	    The move goes through `shutil.move`, so it also works when `src` and
	    `dst_dir` are on different filesystems, where a plain rename fails.

	    Returns: the path the file now lives at.
	    """
	    import shutil  # local import: the rest of the module does not need it

	    dst_dir.mkdir(parents=True, exist_ok=True)
	    target = dst_dir / src.name
	    # Suffixes are always derived from the original name, not the last candidate.
	    stem, suffix = target.stem, target.suffix
	    counter = 0
	    while target.exists():
	        counter += 1
	        target = dst_dir / f"{stem}.{counter}{suffix}"

	    # NOTE: the existence check and the move are not atomic; a file created
	    # at `target` by another process in between is not detected here.
	    shutil.move(str(src), str(target))
	    return target

	# --- Core --------------------------------------------------------------------

	class UnrecognizedColumnsError(ValueError):
	    """Raised when a CSV header matches none of the configured layouts."""

	def classify_csv(csv_path: Path, *, dry_run: bool = False, verbose: bool = False, **read_csv_kwargs) -> Path:
	    """
	    Classify one CSV by its header and move it to the matching directory.

	    Only the header row is parsed (`nrows=0` unless the caller passes its own
	    `nrows`), so a large file is not loaded into memory just to be routed.
	    The header must equal one of the `COL_TO_DST` keys exactly: same names,
	    same order.

	    Args:
	        csv_path: File to classify.
	        dry_run: When true, compute the destination but leave the file alone.
	        verbose: When true, print one line describing the (intended) move.
	        **read_csv_kwargs: Extra keyword arguments forwarded to
	            `pandas.read_csv` (e.g. `encoding`, `sep`).

	    Returns: final (or intended, if dry_run) destination path. In a dry run
	    the path is `dst_dir / csv_path.name`; it does not include any numeric
	    suffix a real move would add to avoid a name collision.

	    Raises:
	        FileNotFoundError: `csv_path` does not exist.
	        UnrecognizedColumnsError: the header matches no known layout.
	    """
	    if not csv_path.exists():
	        raise FileNotFoundError(csv_path)

	    # Caller-supplied kwargs come last, so an explicit `nrows` still wins.
	    read_options = {"nrows": 0, **read_csv_kwargs}
	    header = pd.read_csv(csv_path, **read_options)
	    cols = normalize_cols(header)

	    # A plain lookup avoids chaining an internal KeyError onto the domain error.
	    dst_dir = COL_TO_DST.get(cols)
	    if dst_dir is None:
	        raise UnrecognizedColumnsError(f"Unrecognized columns: {cols}")

	    if dry_run:
	        intended = dst_dir / csv_path.name
	        if verbose:
	            print(f"[DRY-RUN] {csv_path} -> {intended}")
	        return intended

	    moved = safe_move(csv_path, dst_dir)
	    if verbose:
	        print(f"[MOVED] {csv_path} -> {moved}")
	    return moved

	def classify_many(paths: Iterable[Path], *, dry_run: bool = False, verbose: bool = False, **read_csv_kwargs) -> dict[Path, Path]:
	    """
	    Run `classify_csv` on every path, in iteration order.

	    Returns: mapping of each source path to the path `classify_csv` returned
	    for it. Any exception raised for one file propagates immediately.
	    """
	    result: dict[Path, Path] = {}
	    for p in paths:
	        result[p] = classify_csv(p, dry_run=dry_run, verbose=verbose, **read_csv_kwargs)
	    return result

	# --- CLI ---------------------------------------------------------------------

	def parse_args(argv: list[str]) -> argparse.Namespace:
	    """Parse the command-line arguments in `argv` (program name excluded)."""
	    p = argparse.ArgumentParser(
	        description="Classify CSV files by strict header match and move them to mapped destinations."
	    )
	    # Exactly one input source is required: explicit files or a directory.
	    src = p.add_mutually_exclusive_group(required=True)
	    src.add_argument("--csv", type=Path, nargs="+",
	                     help="One or more CSV file paths to classify.")
	    src.add_argument("--dir", type=Path,
	                     help="Directory to scan for CSV files (use with --glob).")
	    p.add_argument("--glob", default="*.csv",
	                   help="Glob pattern when using --dir (default: *.csv).")
	    p.add_argument("--recursive", action="store_true",
	                   help="Recurse into subdirectories when using --dir.")
	    p.add_argument("--dry-run", action="store_true",
	                   help="Show intended moves without renaming files.")
	    p.add_argument("--encoding", default=None,
	                   help="Encoding passed to pandas.read_csv (e.g., utf-8, utf-8-sig).")
	    p.add_argument("--delimiter", "--sep", dest="sep", default=None,
	                   help="Field delimiter passed to pandas.read_csv.")
	    p.add_argument("--verbose", "-v", action="store_true",
	                   help="Verbose output.")
	    return p.parse_args(argv)

	def gather_paths(args: argparse.Namespace) -> list[Path]:
	    """
	    Turn the parsed CLI arguments into the list of files to classify.

	    With `--csv`, the given paths are kept in order. Entries that are not
	    existing regular files are left out, and each one is reported on stderr
	    so a mistyped path does not vanish silently. With `--dir`, the directory
	    is searched with `args.glob` (recursively when `args.recursive` is set)
	    and only regular files are returned.

	    Returns: list of file paths; empty when nothing qualifies.
	    """
	    if args.csv:
	        found: list[Path] = []
	        for candidate in args.csv:
	            if candidate.is_file():
	                found.append(candidate)
	            else:
	                print(f"Skipping (not a file): {candidate}", file=sys.stderr)
	        return found
	    if args.dir:
	        search = args.dir.rglob if args.recursive else args.dir.glob
	        return [match for match in search(args.glob) if match.is_file()]
	    return []

	def main(argv: list[str] | None = None) -> int:
	    """
	    Command-line entry point.

	    Args:
	        argv: Argument list without the program name. `None` means
	            `sys.argv[1:]`; an explicit list (even an empty one) is parsed
	            exactly as given.

	    Returns: process exit code —
	        0 success, 2 no input files, 3 unrecognized header,
	        4 missing file, 5 unparseable or empty CSV.
	    """
	    # `argv or sys.argv[1:]` would swap an explicit [] for the real command
	    # line; fall back only when no list was supplied at all.
	    args = parse_args(sys.argv[1:] if argv is None else argv)
	    files = gather_paths(args)
	    if not files:
	        print("No CSV files found.", file=sys.stderr)
	        return 2

	    # pandas.read_csv kwargs: forward only the options the user actually set.
	    read_kwargs = {}
	    if args.encoding:
	        read_kwargs["encoding"] = args.encoding
	    if args.sep:
	        read_kwargs["sep"] = args.sep

	    try:
	        classify_many(files, dry_run=args.dry_run, verbose=args.verbose, **read_kwargs)
	    except UnrecognizedColumnsError as e:
	        print(f"Error: {e}", file=sys.stderr)
	        return 3
	    except FileNotFoundError as e:
	        print(f"Error: {e}", file=sys.stderr)
	        return 4
	    except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
	        # EmptyDataError (a file with no header at all) is not a ParserError
	        # subclass, so it has to be listed explicitly.
	        print(f"CSV parse error: {e}", file=sys.stderr)
	        return 5
	    return 0

	if __name__ == "__main__":
	    # Hand main()'s return value to the interpreter as the exit status.
	    raise SystemExit(main())
No results found