Skip to content

Instantly share code, notes, and snippets.

@yeiichi
Last active March 27, 2025 11:06
Show Gist options
  • Save yeiichi/d6a11a3ba6f21b0744e18a2524e07446 to your computer and use it in GitHub Desktop.
Save yeiichi/d6a11a3ba6f21b0744e18a2524e07446 to your computer and use it in GitHub Desktop.
Handles concatenation of multiple CSV files in a specified directory into a single CSV file.
#!/usr/bin/env python3
from datetime import datetime
from pathlib import Path
import pandas as pd
class ConcatCSV:
    """
    Concatenate every CSV file in a directory into a single CSV file.

    On construction the directory is checked and its ``*.csv`` files are
    collected. ``get_merged_dataframe`` reads them (every column as string)
    and concatenates them; ``save_csv`` writes the result to a timestamped
    ``merged_YYYYmmddHHMMSS.csv`` file.

    NOTE: the output file is written into ``csv_dir`` itself, so a later run
    on the same directory will also pick up earlier ``merged_*.csv`` files.

    Attributes:
        csv_dir (Path): Directory containing the CSV files to process.
        dedup (bool): Whether duplicate rows (and all-NA rows) are dropped.
        csv_files (list): CSV files found in ``csv_dir``, sorted by path.
        output_path (Path): Path where the merged CSV file will be saved.
        is_valid (bool): Result of the validations run during construction.
    """

    _ERROR_COLOR = "\033[91m"
    _WARN_COLOR = "\033[93m"
    _RESET_COLOR = "\033[0m"

    def __init__(self, csv_dir, dedup):
        """
        Collect the CSV files of ``csv_dir`` and run the initial validations.

        Validation failures are reported on stdout and recorded in
        ``is_valid``; no exception is raised.

        Args:
            csv_dir (str or Path): Path to the directory containing the CSV files.
            dedup (bool): Flag indicating whether duplicates should be excluded
                (True) or included (False).
        """
        output_filename = f'merged_{datetime.now().strftime("%Y%m%d%H%M%S")}.csv'
        self.csv_dir = Path(csv_dir)  # Ensure csv_dir is a Path object
        self.dedup = dedup
        # glob() yields files in arbitrary order; sorting makes the merged row
        # order (and which duplicate survives) reproducible between runs.
        # Only glob a real directory so a bad path simply yields no files.
        self.csv_files = (
            sorted(self.csv_dir.glob("*.csv")) if self.csv_dir.is_dir() else []
        )
        self.output_path = self.csv_dir / output_filename
        self.is_valid = self._run_validations()

    def _print_error(self, message):
        """Print ``message`` to stdout wrapped in the error color codes."""
        print(f"{self._ERROR_COLOR}{message}{self._RESET_COLOR}")

    def _run_validations(self):
        """
        Run the validations needed during initialization.

        Checks that the directory exists and, only if it does, that it
        contains at least one CSV file.

        Returns:
            bool: True if all validations pass, False otherwise.
        """
        return self._validate_directory() and self._validate_csv_files()

    def _validate_directory(self):
        """Return True if ``csv_dir`` is an existing directory, else report it."""
        # is_dir() is already False for a path that does not exist.
        if not self.csv_dir.is_dir():
            self._print_error(
                f"Error: {self.csv_dir} does not exist or is not a directory."
            )
            return False
        return True

    def _validate_csv_files(self):
        """Return True if at least one CSV file was found, else report it."""
        if not self.csv_files:
            self._print_error(
                "Error: No CSV files found in the specified directory."
            )
            return False
        return True

    def get_merged_dataframe(self):
        """
        Read every file in ``csv_files`` and concatenate them.

        Files that cannot be read are reported and skipped. When ``dedup`` is
        set, duplicate rows and rows whose values are all NA are removed.

        Returns:
            pandas.DataFrame or None: The merged data, or None when no file
            could be read.
        """
        dataframes = []
        for csv_file in self.csv_files:
            # Deliberately broad: one unreadable file must not abort the merge.
            try:
                dataframes.append(pd.read_csv(csv_file, dtype="string"))
            except Exception as e:
                self._print_error(f"Error reading {csv_file}: {e}")
        if not dataframes:
            self._print_error("Error: No valid CSV files to process.")
            return None
        df = pd.concat(dataframes, ignore_index=True).convert_dtypes()
        if self.dedup:
            df = df.drop_duplicates().dropna(how='all')
        return df

    def save_csv(self):
        """
        Write the merged data to ``output_path`` and report the saved path.

        Nothing is written when there is no data to merge;
        ``get_merged_dataframe`` has already reported the reason in that case.
        """
        merged = self.get_merged_dataframe()
        if merged is None:
            # Previously this fell through to ``None.to_csv`` and crashed.
            return
        merged.to_csv(self.output_path, index=False)
        print(f"{self._WARN_COLOR}"
              f"Saved: {self.output_path}{self._RESET_COLOR}")
if __name__ == '__main__':
    # Interactive entry point: ask for the directory and the dedup preference,
    # then merge and save.
    my_csv_dir = input("CSV DIR? >> ").strip()
    # Default is "yes"; accept both "n" and "no" as a refusal so that typing
    # "no" does not silently enable deduplication.
    dedup_answer = input("Drop Duplicate Rows? (Y/n) >> ").strip().lower()
    do_dedup = dedup_answer not in {'n', 'no'}
    ConcatCSV(csv_dir=my_csv_dir, dedup=do_dedup).save_csv()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment