yeiichi · February 13, 2025 05:59
diff --git a/list_files_sha256.py b/list_files_sha256.py
 #!/usr/bin/env python3
 import csv
 from hashlib import sha256
 from pathlib import Path


 def hash_by_256(fpath, chunk_size=1024 * 1024):
    """
    Computes the SHA-256 hash of a file by streaming it in fixed-size chunks.

    The digest does not depend on how the input is split, so every file goes
    through the same read loop regardless of its size. Memory use is bounded by
    ``chunk_size``, and the file is not stat-ed beforehand, which avoids an
    extra system call and a size that may be stale by the time the file is read.

    Args:
        fpath (str | Path): The path to the file whose hash needs to be
            calculated.
        chunk_size (int): Maximum number of bytes read per iteration. Must be
            positive. Defaults to 1 MiB.

    Returns:
        str: The computed SHA-256 hash as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b'' at once, which would silently produce the hash
        # of empty input instead of the hash of the file.
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')

    file_hash = sha256()
    with open(fpath, 'rb') as f:
        while chunk := f.read(chunk_size):
            file_hash.update(chunk)
    return file_hash.hexdigest()


 def list_fpaths_in_dir(src_dir):
    """Collects the visible regular files directly inside a directory.

    Entries whose names begin with a dot are never matched by the glob
    pattern, and anything that is not a file (for example a sub-directory)
    is filtered out afterwards. The search is not recursive.

    Args:
        src_dir (str | Path): The directory path to scan for file paths.
    Returns:
        List[Path]: The matching file paths, in the order the glob yields them.
    """
    # '[!.]*' = any name whose first character is not a dot.
    visible_entries = Path(src_dir).glob('[!.]*')
    return list(filter(Path.is_file, visible_entries))


 def make_fname_vs_sha_tbl(list_of_fpath):
    """Builds a table of file names and their SHA-256 digests.

    Args:
        list_of_fpath (Iterable[Path]): Paths of the files to hash.

    Returns:
        List[List[str]]: A header row ``['filename', 'sha256']`` followed by
        one ``[name, digest]`` row per input path, in input order.
    """
    table = [['filename', 'sha256']]
    for path in list_of_fpath:
        table.append([path.name, hash_by_256(path)])
    return table


 def prevent_overwrite(fpath):
    """
    Prevents overwriting an existing file by appending a numerical suffix to the file name.

    If ``fpath`` does not refer to an existing file it is returned unchanged.
    Otherwise candidates of the form ``<stem>_001<suffix>``, ``<stem>_002<suffix>``,
    ... are tried in order and the first one that is not an existing file is
    returned. Each candidate is derived from the ORIGINAL path, so the counter
    replaces the previous one instead of piling up, and the extension stays at
    the end of the name.

    Args:
        fpath (Path): The original file path to check for potential overwriting.

    Returns:
        Path: A modified file path with a numerical suffix if the original file exists, or
        the original path if no file exists with that name.
    """
    candidate = fpath
    counter = 1
    while candidate.is_file():
        # Use stem + suffix of the original path: building from `.name` would
        # yield 'a.csv_001.csv', and building from the previous candidate
        # would yield 'a_001_002.csv'.
        candidate = fpath.with_name(f'{fpath.stem}_{counter:03}{fpath.suffix}')
        counter += 1
    return candidate


 def save_as_csv(csv_path, list_of_lists):
    """
    Writes rows to a UTF-8 encoded CSV file and prints a confirmation message.

    Args:
        csv_path (str | Path): Destination file; it is created or truncated.
        list_of_lists (Iterable[Iterable]): Rows to write, one inner iterable
            per CSV row.
    """
    # The encoding is explicit so non-ASCII file names are written the same way
    # on every platform instead of depending on the locale's default codec.
    # newline='' leaves line-ending handling to the csv module.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(list_of_lists)
    print(f'\033[93mSaved {csv_path}\033[0m')


 if __name__ == '__main__':
    # Ask for a directory, hash every visible file in it, and store the
    # resulting table as 'sha256_<dirname>.csv' inside that directory.
    my_src_dir = Path(input('DIR? >> ').strip())
    if not my_src_dir.is_dir():
        # Fail early with a clear message instead of hashing nothing and then
        # crashing when the CSV is opened.
        raise SystemExit(f'Not a directory: {my_src_dir}')
    fpath_list = sorted(list_fpaths_in_dir(my_src_dir))
    arr = make_fname_vs_sha_tbl(fpath_list)
    # Build the final '.csv' name BEFORE the overwrite check so the check looks
    # at the file that will actually be written. Appending '.csv' by hand
    # (rather than with_suffix) keeps directory names containing a dot intact.
    out_csv = prevent_overwrite(my_src_dir / f'sha256_{my_src_dir.name}.csv')
    save_as_csv(out_csv, arr)
	#!/usr/bin/env python3
	import csv
	from hashlib import sha256
	from pathlib import Path


	def hash_by_256(fpath, chunk_size=1024 * 1024):
	    """
	    Computes the SHA-256 hash of a file by streaming it in fixed-size chunks.

	    The digest does not depend on how the input is split, so every file goes
	    through the same read loop regardless of its size. Memory use is bounded by
	    ``chunk_size``, and the file is not stat-ed beforehand, which avoids an
	    extra system call and a size that may be stale by the time the file is read.

	    Args:
	        fpath (str | Path): The path to the file whose hash needs to be
	            calculated.
	        chunk_size (int): Maximum number of bytes read per iteration. Must be
	            positive. Defaults to 1 MiB.

	    Returns:
	        str: The computed SHA-256 hash as a hexadecimal string.

	    Raises:
	        ValueError: If ``chunk_size`` is zero or negative.
	        OSError: If the file cannot be opened or read.
	    """
	    if chunk_size <= 0:
	        # read(0) returns b'' at once, which would silently produce the hash
	        # of empty input instead of the hash of the file.
	        raise ValueError(f'chunk_size must be positive, got {chunk_size}')

	    file_hash = sha256()
	    with open(fpath, 'rb') as f:
	        while chunk := f.read(chunk_size):
	            file_hash.update(chunk)
	    return file_hash.hexdigest()


	def list_fpaths_in_dir(src_dir):
	    """Lists the visible regular files directly inside a directory.

	    Entries whose names begin with a dot are never matched by the glob
	    pattern, and anything that is not a file (for example a sub-directory)
	    is filtered out afterwards. The search is not recursive.

	    Args:
	        src_dir (str | Path): The directory path to scan for file paths.
	    Returns:
	        List[Path]: The matching file paths, in the order the glob yields them.
	    """
	    # '[!.]*' = any name whose first character is not a dot.
	    return [entry for entry in Path(src_dir).glob('[!.]*') if entry.is_file()]


	def make_fname_vs_sha_tbl(list_of_fpath):
	    """Builds a table of file names and their SHA-256 digests.

	    Args:
	        list_of_fpath (Iterable[Path]): Paths of the files to hash.

	    Returns:
	        List[List[str]]: A header row ``['filename', 'sha256']`` followed by
	        one ``[name, digest]`` row per input path, in input order.
	    """
	    header = [['filename', 'sha256']]
	    return header + [[i.name, hash_by_256(i)] for i in list_of_fpath]


	def prevent_overwrite(fpath):
	    """
	    Prevents overwriting an existing file by appending a numerical suffix to the file name.

	    If ``fpath`` does not refer to an existing file it is returned unchanged.
	    Otherwise candidates of the form ``<stem>_001<suffix>``, ``<stem>_002<suffix>``,
	    ... are tried in order and the first one that is not an existing file is
	    returned. Each candidate is derived from the ORIGINAL path, so the counter
	    replaces the previous one instead of piling up, and the extension stays at
	    the end of the name.

	    Args:
	        fpath (Path): The original file path to check for potential overwriting.

	    Returns:
	        Path: A modified file path with a numerical suffix if the original file exists, or
	        the original path if no file exists with that name.
	    """
	    candidate = fpath
	    counter = 1
	    while candidate.is_file():
	        # Use stem + suffix of the original path: building from `.name` would
	        # yield 'a.csv_001.csv', and building from the previous candidate
	        # would yield 'a_001_002.csv'.
	        candidate = fpath.with_name(f'{fpath.stem}_{counter:03}{fpath.suffix}')
	        counter += 1
	    return candidate


	def save_as_csv(csv_path, list_of_lists):
	    """
	    Writes rows to a UTF-8 encoded CSV file and prints a confirmation message.

	    Args:
	        csv_path (str | Path): Destination file; it is created or truncated.
	        list_of_lists (Iterable[Iterable]): Rows to write, one inner iterable
	            per CSV row.
	    """
	    # The encoding is explicit so non-ASCII file names are written the same way
	    # on every platform instead of depending on the locale's default codec.
	    # newline='' leaves line-ending handling to the csv module.
	    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
	        csv.writer(csvfile).writerows(list_of_lists)
	    print(f'\033[93mSaved {csv_path}\033[0m')


	if __name__ == '__main__':
	    # Ask for a directory, hash every visible file in it, and store the
	    # resulting table as 'sha256_<dirname>.csv' inside that directory.
	    my_src_dir = Path(input('DIR? >> ').strip())
	    if not my_src_dir.is_dir():
	        # Fail early with a clear message instead of hashing nothing and then
	        # crashing when the CSV is opened.
	        raise SystemExit(f'Not a directory: {my_src_dir}')
	    fpath_list = sorted(list_fpaths_in_dir(my_src_dir))
	    arr = make_fname_vs_sha_tbl(fpath_list)
	    # Build the final '.csv' name BEFORE the overwrite check so the check looks
	    # at the file that will actually be written. Appending '.csv' by hand
	    # (rather than with_suffix) keeps directory names containing a dot intact.
	    out_csv = prevent_overwrite(my_src_dir / f'sha256_{my_src_dir.name}.csv')
	    save_as_csv(out_csv, arr)