Skip to content

Instantly share code, notes, and snippets.

@yeiichi
Created August 24, 2025 03:36
Show Gist options
  • Select an option

  • Save yeiichi/97808f630a1b1b426c0537c42012e639 to your computer and use it in GitHub Desktop.

Select an option

Save yeiichi/97808f630a1b1b426c0537c42012e639 to your computer and use it in GitHub Desktop.
Combine all `.xlsx` files present in a specified directory
#!/usr/bin/env python3
"""
A program to concatenate Excel files in a directory.
This script combines all `.xlsx` files present in a specified directory, ensuring
they each have identical and consistent columns (order-sensitive). The combined
data is written into a single output Excel file. Additionally, duplicate rows are
dropped from the final result, and the index is reset. An output file name can
either be provided via the command-line arguments or auto-generated based on
the current timestamp.
"""
from argparse import ArgumentParser
from datetime import datetime
from pathlib import Path
import pandas as pd
from pandas import DataFrame
FILE_NAME_GLOB = '*.xlsx'
TIMESTAMP_FMT = '%Y%m%d_%H%M'
def build_parser() -> ArgumentParser:
"""Builds and returns an ArgumentParser"""
parser = ArgumentParser(description='Concatenate Excel files.')
parser.add_argument(
'source_dir',
type=Path,
help='Path to the source directory containing Excel files.'
)
parser.add_argument(
'-o', '--output_file',
type=Path,
default=None,
help='Output .xlsx path (default: auto-generated next to CWD).'
)
return parser
def list_excel_files(source_dir: Path) -> list[Path]:
"""
Return a deterministically ordered list of .xlsx files in the given directory.
"""
return sorted(source_dir.glob(FILE_NAME_GLOB))
def ensure_single_sheet(xl_path: Path) -> None:
"""
Ensure the Excel file contains exactly one sheet, or raise a ValueError.
"""
with pd.ExcelFile(xl_path) as xls:
if len(xls.sheet_names) != 1:
raise ValueError(f"Excel file must have exactly one sheet. ({xl_path})")
def validate_excel_files(source_dir: Path) -> None:
"""
Validate that the directory contains .xlsx files, each has exactly one sheet,
and that all files share the same columns (order-sensitive).
Raises SystemExit/ValueError on problems.
"""
excel_files = list_excel_files(source_dir)
if not excel_files:
raise SystemExit(f"No Excel files found in: {source_dir}")
first_excel_file = excel_files[0]
ensure_single_sheet(first_excel_file)
# Read only header to get columns without loading full file
first_columns = pd.read_excel(first_excel_file, nrows=0).columns
for each_excel_file in excel_files[1:]:
ensure_single_sheet(each_excel_file)
cols = pd.read_excel(each_excel_file, nrows=0).columns
if not cols.equals(first_columns):
raise ValueError(f"Excel files must have the same columns. Mismatch found in: {each_excel_file}")
def concatenate_excel_files(source_dir: Path) -> DataFrame:
"""
Read and vertically concatenate all Excel files in the directory.
Drops duplicate rows and resets the index.
"""
files = list_excel_files(source_dir)
df_list = [pd.read_excel(src_xl) for src_xl in files]
if not df_list:
return pd.DataFrame()
return pd.concat(df_list, ignore_index=True).drop_duplicates()
def create_output_file_name() -> str:
"""
Create a default output file name with a timestamp suffix.
"""
dttm = datetime.now().strftime(TIMESTAMP_FMT)
return f'concatenated_excels_{dttm}.xlsx'
def save_output_file(df: DataFrame, output_file: str | Path) -> None:
df.to_excel(output_file, index=False)
def main() -> None:
args = build_parser().parse_args()
source_dir: Path = args.source_dir
output_file: Path | None = args.output_file
if not source_dir.is_dir():
raise SystemExit(f"Source directory does not exist: {source_dir}")
# Compute default output name lazily if not provided
if output_file is None:
output_file = Path(create_output_file_name())
validate_excel_files(source_dir)
df = concatenate_excel_files(source_dir)
save_output_file(df, output_file)
print(f"Output file saved: {output_file}")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment