yeiichi · March 27, 2025 11:06
diff --git a/concat_any_csv.py b/concat_any_csv.py
 #!/usr/bin/env python3
 from datetime import datetime
 from pathlib import Path

 import pandas as pd


 class ConcatCSV:
    """
    Concatenate every CSV file found in one directory into a single CSV file.

    On construction the directory is checked and its ``*.csv`` files are
    listed, sorted by path so the row order of the merged output is
    reproducible. ``get_merged_dataframe`` reads each file as text and stacks
    the rows; ``save_csv`` writes the result into the same directory under a
    timestamped ``merged_YYYYmmddHHMMSS.csv`` name.

    Problems are reported by printing a colored message instead of raising,
    so callers that need to know whether anything was done should check
    ``is_valid`` or the ``None`` return of ``get_merged_dataframe``.

    Attributes:
        csv_dir (Path): Directory that is scanned for CSV files.
        dedup (bool): Whether duplicate rows and all-empty rows are dropped.
        csv_files (list): CSV files found in ``csv_dir``, sorted by path.
        output_path (Path): Path where the merged CSV file will be saved.
        is_valid (bool): True when ``csv_dir`` is a directory that holds at
            least one CSV file.
    """
    _ERROR_COLOR = "\033[91m"
    _WARN_COLOR = "\033[93m"
    _RESET_COLOR = "\033[0m"

    def __init__(self, csv_dir, dedup):
        """
        Record the input directory, list its CSV files and validate both.

        Validation failures are printed, not raised; the outcome is kept in
        ``is_valid`` so callers can branch on it.

        Args:
            csv_dir (str or Path): Path to the directory containing the CSV files.
            dedup (bool): True to drop duplicate rows (and rows whose values
                are all missing) from the merged result, False to keep
                every row.
        """
        self.csv_dir = Path(csv_dir)
        self.dedup = dedup
        # glob() makes no promise about ordering; sorting keeps the merged
        # row order stable from run to run and across platforms.
        self.csv_files = sorted(self.csv_dir.glob("*.csv"))
        stamp = datetime.now().strftime("%Y%m%d%H%M%S")
        # NOTE: the output lands inside csv_dir with a .csv suffix, so a
        # later run over the same directory will pick it up as an input.
        self.output_path = self.csv_dir / f'merged_{stamp}.csv'
        # Keep the result instead of discarding it: __init__ cannot abort
        # construction by returning early.
        self.is_valid = self._run_validations()

    def _run_validations(self) -> bool:
        """
        Run the directory check, then the CSV-presence check.

        The second check is skipped when the first one fails, so at most one
        error message is printed.

        Returns:
            bool: True if all validations pass, False otherwise.
        """
        return self._validate_directory() and self._validate_csv_files()

    def _validate_directory(self) -> bool:
        """Return True if ``csv_dir`` is an existing directory, else print an error."""
        # is_dir() is already False for a path that does not exist.
        if self.csv_dir.is_dir():
            return True
        print(f"{self._ERROR_COLOR}"
              f"Error: {self.csv_dir} does not exist or is not a directory."
              f"{self._RESET_COLOR}")
        return False

    def _validate_csv_files(self) -> bool:
        """Return True if at least one CSV file was found, else print an error."""
        if self.csv_files:
            return True
        print(f"{self._ERROR_COLOR}"
              f"Error: No CSV files found in the specified directory."
              f"{self._RESET_COLOR}")
        return False

    def get_merged_dataframe(self):
        """
        Read every file in ``csv_files`` and stack the rows into one DataFrame.

        All columns are read with the pandas ``"string"`` dtype. A file that
        cannot be read is reported and skipped so the remaining files are
        still merged.

        Returns:
            pandas.DataFrame or None: The merged rows, or None when no file
            could be read (an error message is printed in that case).
        """
        frames = []
        for csv_file in self.csv_files:
            try:
                frames.append(pd.read_csv(csv_file, dtype="string"))
            except Exception as e:  # best effort: report the file and go on
                print(f"{self._ERROR_COLOR}"
                      f"Error reading {csv_file}: {e}{self._RESET_COLOR}")
        if not frames:
            print(f"{self._ERROR_COLOR}"
                  f"Error: No valid CSV files to process.{self._RESET_COLOR}")
            return None
        merged = pd.concat(frames, ignore_index=True).convert_dtypes()
        if self.dedup:
            # Rows with every value missing are removed only in dedup mode.
            merged = merged.drop_duplicates().dropna(how='all')
        return merged

    def save_csv(self):
        """
        Write the merged DataFrame to ``output_path`` without the index column.

        Nothing is written when there is no data to merge;
        ``get_merged_dataframe`` has already printed the reason in that case.
        """
        merged = self.get_merged_dataframe()
        if merged is None:
            # Previously this fell through to None.to_csv(...) and crashed
            # with AttributeError right after the error message.
            return
        merged.to_csv(self.output_path, index=False)
        print(f"{self._WARN_COLOR}"
              f"Saved: {self.output_path}{self._RESET_COLOR}")


 if __name__ == '__main__':
    # Ask for the directory first, then for the dedup choice; any answer
    # other than "n" (case-insensitive, surrounding blanks ignored) means yes.
    target_dir = input("CSV DIR? >> ").strip()
    dedup_answer = input("Drop Duplicate Rows? (Y/n) >> ").strip().lower()
    merger = ConcatCSV(csv_dir=target_dir, dedup=(dedup_answer != 'n'))
    merger.save_csv()
	#!/usr/bin/env python3
	from datetime import datetime
	from pathlib import Path

	import pandas as pd


	class ConcatCSV:
	    """
	    Concatenate every CSV file found in one directory into a single CSV file.

	    On construction the directory is checked and its ``*.csv`` files are
	    listed, sorted by path so the row order of the merged output is
	    reproducible. ``get_merged_dataframe`` reads each file as text and stacks
	    the rows; ``save_csv`` writes the result into the same directory under a
	    timestamped ``merged_YYYYmmddHHMMSS.csv`` name.

	    Problems are reported by printing a colored message instead of raising,
	    so callers that need to know whether anything was done should check
	    ``is_valid`` or the ``None`` return of ``get_merged_dataframe``.

	    Attributes:
	        csv_dir (Path): Directory that is scanned for CSV files.
	        dedup (bool): Whether duplicate rows and all-empty rows are dropped.
	        csv_files (list): CSV files found in ``csv_dir``, sorted by path.
	        output_path (Path): Path where the merged CSV file will be saved.
	        is_valid (bool): True when ``csv_dir`` is a directory that holds at
	            least one CSV file.
	    """
	    _ERROR_COLOR = "\033[91m"
	    _WARN_COLOR = "\033[93m"
	    _RESET_COLOR = "\033[0m"

	    def __init__(self, csv_dir, dedup):
	        """
	        Record the input directory, list its CSV files and validate both.

	        Validation failures are printed, not raised; the outcome is kept in
	        ``is_valid`` so callers can branch on it.

	        Args:
	            csv_dir (str or Path): Path to the directory containing the CSV files.
	            dedup (bool): True to drop duplicate rows (and rows whose values
	                are all missing) from the merged result, False to keep
	                every row.
	        """
	        self.csv_dir = Path(csv_dir)
	        self.dedup = dedup
	        # glob() makes no promise about ordering; sorting keeps the merged
	        # row order stable from run to run and across platforms.
	        self.csv_files = sorted(self.csv_dir.glob("*.csv"))
	        stamp = datetime.now().strftime("%Y%m%d%H%M%S")
	        # NOTE: the output lands inside csv_dir with a .csv suffix, so a
	        # later run over the same directory will pick it up as an input.
	        self.output_path = self.csv_dir / f'merged_{stamp}.csv'
	        # Keep the result instead of discarding it: __init__ cannot abort
	        # construction by returning early.
	        self.is_valid = self._run_validations()

	    def _run_validations(self) -> bool:
	        """
	        Run the directory check, then the CSV-presence check.

	        The second check is skipped when the first one fails, so at most one
	        error message is printed.

	        Returns:
	            bool: True if all validations pass, False otherwise.
	        """
	        return self._validate_directory() and self._validate_csv_files()

	    def _validate_directory(self) -> bool:
	        """Return True if ``csv_dir`` is an existing directory, else print an error."""
	        # is_dir() is already False for a path that does not exist.
	        if self.csv_dir.is_dir():
	            return True
	        print(f"{self._ERROR_COLOR}"
	              f"Error: {self.csv_dir} does not exist or is not a directory."
	              f"{self._RESET_COLOR}")
	        return False

	    def _validate_csv_files(self) -> bool:
	        """Return True if at least one CSV file was found, else print an error."""
	        if self.csv_files:
	            return True
	        print(f"{self._ERROR_COLOR}"
	              f"Error: No CSV files found in the specified directory."
	              f"{self._RESET_COLOR}")
	        return False

	    def get_merged_dataframe(self):
	        """
	        Read every file in ``csv_files`` and stack the rows into one DataFrame.

	        All columns are read with the pandas ``"string"`` dtype. A file that
	        cannot be read is reported and skipped so the remaining files are
	        still merged.

	        Returns:
	            pandas.DataFrame or None: The merged rows, or None when no file
	            could be read (an error message is printed in that case).
	        """
	        frames = []
	        for csv_file in self.csv_files:
	            try:
	                frames.append(pd.read_csv(csv_file, dtype="string"))
	            except Exception as e:  # best effort: report the file and go on
	                print(f"{self._ERROR_COLOR}"
	                      f"Error reading {csv_file}: {e}{self._RESET_COLOR}")
	        if not frames:
	            print(f"{self._ERROR_COLOR}"
	                  f"Error: No valid CSV files to process.{self._RESET_COLOR}")
	            return None
	        merged = pd.concat(frames, ignore_index=True).convert_dtypes()
	        if self.dedup:
	            # Rows with every value missing are removed only in dedup mode.
	            merged = merged.drop_duplicates().dropna(how='all')
	        return merged

	    def save_csv(self):
	        """
	        Write the merged DataFrame to ``output_path`` without the index column.

	        Nothing is written when there is no data to merge;
	        ``get_merged_dataframe`` has already printed the reason in that case.
	        """
	        merged = self.get_merged_dataframe()
	        if merged is None:
	            # Previously this fell through to None.to_csv(...) and crashed
	            # with AttributeError right after the error message.
	            return
	        merged.to_csv(self.output_path, index=False)
	        print(f"{self._WARN_COLOR}"
	              f"Saved: {self.output_path}{self._RESET_COLOR}")


	if __name__ == '__main__':
	    # Ask for the directory first, then for the dedup choice; any answer
	    # other than "n" (case-insensitive, surrounding blanks ignored) means yes.
	    target_dir = input("CSV DIR? >> ").strip()
	    dedup_answer = input("Drop Duplicate Rows? (Y/n) >> ").strip().lower()
	    merger = ConcatCSV(csv_dir=target_dir, dedup=(dedup_answer != 'n'))
	    merger.save_csv()