Skip to content

Instantly share code, notes, and snippets.

@yeiichi
Last active February 13, 2025 05:59
Show Gist options
  • Save yeiichi/95123a6b517d2121087b15e57206fb89 to your computer and use it in GitHub Desktop.
Save yeiichi/95123a6b517d2121087b15e57206fb89 to your computer and use it in GitHub Desktop.
Create a filename / sha256 checksum list and save as a CSV file.
#!/usr/bin/env python3
import csv
from hashlib import sha256
from pathlib import Path
def hash_by_256(fpath):
    """Return the SHA-256 hex digest of the file at ``fpath``.

    The read strategy depends on the file size reported by ``stat``:

    * up to 512 KB: the whole file is read and hashed in a single call;
    * up to 10 MB: the file is streamed in 4 KB blocks;
    * larger: the file is streamed in 8 KB blocks.

    The block size only changes how the bytes are fed to the hasher; the
    resulting digest is the same whichever branch is taken.

    Args:
        fpath: Path of the file to hash. It is passed to both
            ``pathlib.Path`` and ``open``, so a ``str`` or a ``Path`` works.

    Returns:
        str: The computed SHA-256 hash as a hexadecimal string.
    """
    one_shot_limit = 512 * 1024
    small_block_limit = 10 * 1024 * 1024

    n_bytes = Path(fpath).stat().st_size

    # Small files: hash the full content from a single read.
    if n_bytes <= one_shot_limit:
        with open(fpath, 'rb') as stream:
            return sha256(stream.read()).hexdigest()

    block_len = 4096 if n_bytes <= small_block_limit else 8192

    digest = sha256()
    with open(fpath, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (EOF).
        for block in iter(lambda: stream.read(block_len), b''):
            digest.update(block)
    return digest.hexdigest()
def list_fpaths_in_dir(src_dir):
    """Collect the non-hidden files located directly inside ``src_dir``.

    Entries whose name starts with a dot are skipped by the glob pattern,
    and anything that is not a file (e.g. a sub-directory) is filtered out.
    The scan is not recursive, and the result is returned in the order the
    glob yields it (no sorting is applied here).

    Args:
        src_dir: The directory to scan; passed to ``pathlib.Path``, so a
            ``str`` or a ``Path`` works.

    Returns:
        list[Path]: Paths of the matching files.
    """
    found = []
    # '[!.]*' matches every name whose first character is not a dot.
    for entry in Path(src_dir).glob('[!.]*'):
        if entry.is_file():
            found.append(entry)
    return found
def make_fname_vs_sha_tbl(list_of_fpath):
    """Build a filename / SHA-256 table ready to be written as CSV rows.

    Args:
        list_of_fpath: Iterable of path objects exposing a ``name``
            attribute (e.g. ``pathlib.Path``); each one is hashed with
            ``hash_by_256``.

    Returns:
        list[list[str]]: A header row ``['filename', 'sha256']`` followed by
        one ``[file name, hex digest]`` row per input path, in input order.
    """
    table = [['filename', 'sha256']]
    for entry in list_of_fpath:
        table.append([entry.name, hash_by_256(entry)])
    return table
def prevent_overwrite(fpath):
    """Return a path that does not collide with an existing file.

    If ``fpath`` does not refer to an existing file it is returned as is.
    Otherwise a three-digit counter is inserted between the stem and the
    extension of the ORIGINAL name (``report.csv`` -> ``report_001.csv``,
    ``report_002.csv``, ...) until a free name is found.

    Every candidate is derived from the original stem and suffix. Building
    it from the previous candidate's full name would repeat the extension
    and pile up suffixes (``report.csv_001.csv_002.csv``).

    Args:
        fpath (Path | str): The desired output path.

    Returns:
        Path: ``fpath`` itself when no file has that name, otherwise the
        first numbered sibling path that is not an existing file.
    """
    base = Path(fpath)
    candidate = base
    counter = 1
    while candidate.is_file():
        candidate = base.with_name(f'{base.stem}_{counter:03}{base.suffix}')
        counter += 1
    return candidate
def save_as_csv(csv_path, list_of_lists, *, encoding='utf-8'):
    """Write rows to a CSV file and print a confirmation message.

    The file is opened with an explicit encoding rather than the locale
    default, so rows containing non-ASCII text (e.g. file names) are written
    the same way on every platform and cannot fail with
    ``UnicodeEncodeError`` on a narrow locale code page.

    Args:
        csv_path: Destination path; an existing file is overwritten.
        list_of_lists: Iterable of rows, each row an iterable of cell values.
        encoding: Text encoding of the output file. Defaults to UTF-8.
    """
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', newline='', encoding=encoding) as csvfile:
        csv.writer(csvfile).writerows(list_of_lists)
    # ANSI escape codes: bright yellow text, then reset.
    print(f'\033[93mSaved {csv_path}\033[0m')
if __name__ == '__main__':
    # Ask for the directory to scan, then write the checksum table into it.
    my_src_dir = Path(input('DIR? >> ').strip())
    if not my_src_dir.is_dir():
        # Fail with a clear message instead of a traceback when saving.
        raise SystemExit(f'Not a directory: {my_src_dir}')
    fpath_list = sorted(list_fpaths_in_dir(my_src_dir))
    arr = make_fname_vs_sha_tbl(fpath_list)
    # Build the final '.csv' name BEFORE the overwrite check so the check
    # looks at the file that will actually be written. Appending the
    # extension in the f-string (instead of with_suffix) also keeps
    # directory names that contain a dot intact.
    out_csv = prevent_overwrite(my_src_dir / f'sha256_{my_src_dir.name}.csv')
    save_as_csv(out_csv, arr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment