Handles concatenation of multiple CSV files in a specified directory into a single CSV file.
#!/usr/bin/env python3
from datetime import datetime
from pathlib import Path

import pandas as pd


class ConcatCSV:
    """
    Handles concatenation of multiple CSV files in a specified directory into a single CSV file.

    This class provides functionality to validate a directory containing CSV files,
    concatenate these files into a unified DataFrame, and save the merged output as a
    new CSV file. It validates directory existence, checks whether any CSV files are
    available, and optionally drops duplicate rows from the merged output.

    Attributes:
        csv_dir (Path): Path to the directory containing CSV files for processing.
        csv_files (list): List of CSV files found in the specified directory.
        output_path (Path): Path where the merged CSV file will be saved.
    """

    _ERROR_COLOR = "\033[91m"
    _WARN_COLOR = "\033[93m"
    _RESET_COLOR = "\033[0m"
    def __init__(self, csv_dir, dedup):
        """
        Initializes an object that processes and validates CSV files in a specified directory.

        The constructor ensures the directory exists, lists all CSV files within it, and
        sets a timestamped output file path. Optionally, duplicate rows can be dropped
        from the merged output.

        Args:
            csv_dir (str or Path): Path to the directory containing the CSV files.
            dedup (bool): Flag indicating whether duplicate rows should be dropped (True)
                or kept (False).
        """
        output_filename = f'merged_{datetime.now().strftime("%Y%m%d%H%M%S")}.csv'
        self.csv_dir = Path(csv_dir)  # Ensure csv_dir is a Path object
        self.dedup = dedup
        self.csv_files = list(self.csv_dir.glob("*.csv"))  # List all CSV files
        self.output_path = self.csv_dir / output_filename
        if not self._run_validations():  # Consolidated validation; errors are printed here
            return
    def _run_validations(self):
        """
        Runs all validations needed during initialization. Combines the directory
        and CSV-file-existence checks into one method for readability.

        Returns:
            bool: True if all validations pass, False otherwise.
        """
        return (
            self._validate_directory()
            and self._validate_csv_files()
        )
    def _validate_directory(self):
        if not self.csv_dir.exists() or not self.csv_dir.is_dir():
            print(f"{self._ERROR_COLOR}"
                  f"Error: {self.csv_dir} does not exist or is not a directory."
                  f"{self._RESET_COLOR}")
            return False
        return True

    def _validate_csv_files(self):
        if not self.csv_files:
            print(f"{self._ERROR_COLOR}"
                  f"Error: No CSV files found in the specified directory."
                  f"{self._RESET_COLOR}")
            return False
        return True
    def get_merged_dataframe(self):
        # Read each CSV into a DataFrame, skipping any file that fails to parse.
        dataframes = []
        for csv_file in self.csv_files:
            try:
                df = pd.read_csv(csv_file, dtype="string")
                dataframes.append(df)
            except Exception as e:
                print(f"{self._ERROR_COLOR}"
                      f"Error reading {csv_file}: {e}{self._RESET_COLOR}")
        if not dataframes:
            print(f"{self._ERROR_COLOR}"
                  f"Error: No valid CSV files to process.{self._RESET_COLOR}")
            return None
        # Concatenate, then optionally drop duplicate rows and rows that are entirely empty.
        df = pd.concat(dataframes, ignore_index=True).convert_dtypes()
        if self.dedup:
            df = df.drop_duplicates().dropna(how='all')
        return df
    def save_csv(self):
        df = self.get_merged_dataframe()
        if df is None:  # Nothing to save; the error has already been reported.
            return
        df.to_csv(self.output_path, index=False)
        print(f"{self._WARN_COLOR}"
              f"Saved: {self.output_path}{self._RESET_COLOR}")


if __name__ == '__main__':
    my_csv_dir = input("CSV DIR? >> ").strip()
    do_dedup = input("Drop Duplicate Rows? (Y/n) >> ").strip().lower() != 'n'
    ConcatCSV(csv_dir=my_csv_dir, dedup=do_dedup).save_csv()
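A minimal usage sketch for importing the class instead of running the script interactively is shown below. The module name concat_csv and the ./reports directory are illustrative assumptions; the gist itself does not fix a file name.

# Usage sketch (assumed module name and directory; adjust to your setup).
from concat_csv import ConcatCSV

merger = ConcatCSV(csv_dir="./reports", dedup=True)

# Inspect the merged DataFrame before writing anything to disk.
df = merger.get_merged_dataframe()
if df is not None:
    print(df.shape)

# Write merged_<timestamp>.csv into ./reports/.
merger.save_csv()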