Created
August 24, 2025 03:36
-
-
Save yeiichi/97808f630a1b1b426c0537c42012e639 to your computer and use it in GitHub Desktop.
Combine all `.xlsx` files present in a specified directory
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| A program to concatenate Excel files in a directory. | |
| This script combines all `.xlsx` files present in a specified directory, ensuring | |
| they each have identical and consistent columns (order-sensitive). The combined | |
| data is written into a single output Excel file. Additionally, duplicate rows are | |
| dropped from the final result, and the index is reset. An output file name can | |
| either be provided via the command-line arguments or auto-generated based on | |
| the current timestamp. | |
| """ | |
| from argparse import ArgumentParser | |
| from datetime import datetime | |
| from pathlib import Path | |
| import pandas as pd | |
| from pandas import DataFrame | |
| FILE_NAME_GLOB = '*.xlsx' | |
| TIMESTAMP_FMT = '%Y%m%d_%H%M' | |
| def build_parser() -> ArgumentParser: | |
| """Builds and returns an ArgumentParser""" | |
| parser = ArgumentParser(description='Concatenate Excel files.') | |
| parser.add_argument( | |
| 'source_dir', | |
| type=Path, | |
| help='Path to the source directory containing Excel files.' | |
| ) | |
| parser.add_argument( | |
| '-o', '--output_file', | |
| type=Path, | |
| default=None, | |
| help='Output .xlsx path (default: auto-generated next to CWD).' | |
| ) | |
| return parser | |
| def list_excel_files(source_dir: Path) -> list[Path]: | |
| """ | |
| Return a deterministically ordered list of .xlsx files in the given directory. | |
| """ | |
| return sorted(source_dir.glob(FILE_NAME_GLOB)) | |
| def ensure_single_sheet(xl_path: Path) -> None: | |
| """ | |
| Ensure the Excel file contains exactly one sheet, or raise a ValueError. | |
| """ | |
| with pd.ExcelFile(xl_path) as xls: | |
| if len(xls.sheet_names) != 1: | |
| raise ValueError(f"Excel file must have exactly one sheet. ({xl_path})") | |
| def validate_excel_files(source_dir: Path) -> None: | |
| """ | |
| Validate that the directory contains .xlsx files, each has exactly one sheet, | |
| and that all files share the same columns (order-sensitive). | |
| Raises SystemExit/ValueError on problems. | |
| """ | |
| excel_files = list_excel_files(source_dir) | |
| if not excel_files: | |
| raise SystemExit(f"No Excel files found in: {source_dir}") | |
| first_excel_file = excel_files[0] | |
| ensure_single_sheet(first_excel_file) | |
| # Read only header to get columns without loading full file | |
| first_columns = pd.read_excel(first_excel_file, nrows=0).columns | |
| for each_excel_file in excel_files[1:]: | |
| ensure_single_sheet(each_excel_file) | |
| cols = pd.read_excel(each_excel_file, nrows=0).columns | |
| if not cols.equals(first_columns): | |
| raise ValueError(f"Excel files must have the same columns. Mismatch found in: {each_excel_file}") | |
| def concatenate_excel_files(source_dir: Path) -> DataFrame: | |
| """ | |
| Read and vertically concatenate all Excel files in the directory. | |
| Drops duplicate rows and resets the index. | |
| """ | |
| files = list_excel_files(source_dir) | |
| df_list = [pd.read_excel(src_xl) for src_xl in files] | |
| if not df_list: | |
| return pd.DataFrame() | |
| return pd.concat(df_list, ignore_index=True).drop_duplicates() | |
| def create_output_file_name() -> str: | |
| """ | |
| Create a default output file name with a timestamp suffix. | |
| """ | |
| dttm = datetime.now().strftime(TIMESTAMP_FMT) | |
| return f'concatenated_excels_{dttm}.xlsx' | |
| def save_output_file(df: DataFrame, output_file: str | Path) -> None: | |
| df.to_excel(output_file, index=False) | |
| def main() -> None: | |
| args = build_parser().parse_args() | |
| source_dir: Path = args.source_dir | |
| output_file: Path | None = args.output_file | |
| if not source_dir.is_dir(): | |
| raise SystemExit(f"Source directory does not exist: {source_dir}") | |
| # Compute default output name lazily if not provided | |
| if output_file is None: | |
| output_file = Path(create_output_file_name()) | |
| validate_excel_files(source_dir) | |
| df = concatenate_excel_files(source_dir) | |
| save_output_file(df, output_file) | |
| print(f"Output file saved: {output_file}") | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment