Last active
October 16, 2024 14:56
-
-
Save bohdon/c51c7a7073659d45679427da098eb4f1 to your computer and use it in GitHub Desktop.
Clean a p4proxy cache to remove unnecessary files and free up disk space.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import annotations | |
import logging | |
import os | |
import re | |
import time | |
import click | |
# Module-level logger shared by everything in this script.
LOG = logging.getLogger(__name__)

# Configure the root logger once at import time: DEBUG and above, with the
# level name right-aligned in an 8-character column before the message.
logging.basicConfig(level=logging.DEBUG, format="[%(levelname)8s] %(message)s")
def sizeof_fmt(num: float, suffix: str = "B") -> str:
    """
    Format a quantity as a human-readable string using decimal (1000-based) units.

    Args:
        num: The value to format, e.g. a size in bytes. May be negative.
        suffix: The unit suffix appended after the magnitude prefix.

    Returns:
        The value scaled to the largest fitting prefix with one decimal place,
        e.g. ``1500`` -> ``"1.5KB"``. Values beyond the "Z" prefix use "Y".
    """
    for unit in ("", "K", "M", "G", "T", "P", "E", "Z"):
        # Compare the value as it will be displayed (one decimal place), so
        # that e.g. 999999 is promoted to "1.0MB" rather than "1000.0KB".
        if abs(round(num, 1)) < 1000.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1000.0
    return f"{num:.1f}Y{suffix}"
class P4PCacheCleaner:
    """
    Util for cleaning a p4proxy cache directory.

    Walks `cache_root` and, for every directory whose path ends with ",d"
    (presumably Perforce's per-file revision directories -- confirm against
    the proxy's layout), applies the enabled cleaning methods. Files are only
    deleted when `force` is set; otherwise the run is a dry-run that merely
    records what would be removed.
    """

    def __init__(self, cache_root: str, latest_only=False, force=False, verbose=False):
        """
        Args:
            cache_root: Path of the p4proxy cache directory to clean.
            latest_only: Keep only the highest-CL revision file per directory.
            force: Actually delete files; when False nothing is removed.
            verbose: Log every file selected for removal.
        """
        self.cache_root = cache_root
        self.latest_only = latest_only
        self.force = force
        self.verbose = verbose

        # regex pattern for revision file names, e.g. 1.12345.gz
        # (raw string: "\." and "\d" are not valid str escape sequences)
        self.rev_filename_pat = re.compile(r"1\.(?P<CL>\d+)\.gz")

        # paths of the files removed (or selected for removal in a dry-run)
        self.files_removed: list[str] = []
        # total size in bytes of those files
        self.bytes_removed = 0

    def validate_options(self) -> bool:
        """
        Check that at least one cleaning method is enabled.

        Returns:
            True if there is something to do, False (after logging a warning) otherwise.
        """
        # make sure at least one cleaning method is enabled (currently only 1)
        if not self.latest_only:
            LOG.warning("No cleaning options were enabled, check --help")
            return False
        return True

    def clean(self) -> None:
        """
        Perform the clean.

        Logs an error and returns early if the cache directory does not exist
        or no cleaning option is enabled. Otherwise walks the cache, cleans
        each ",d" directory and logs a summary of what was removed.
        """
        start_time = time.time()

        if not os.path.isdir(self.cache_root):
            LOG.error(f"Cache directory not found: {self.cache_root}")
            return

        if not self.validate_options():
            return

        LOG.info(f"Cleaning {self.cache_root}...")

        for dir_path, dir_names, file_names in os.walk(self.cache_root):
            if not dir_path.endswith(",d"):
                continue
            if dir_names:
                LOG.error(f"Unexpected subdirectory in {dir_path}")
            # a directory with a single file has nothing older to remove
            if self.latest_only and len(file_names) > 1:
                self.remove_all_but_latest(dir_path, file_names)

        elapsed_time = time.time() - start_time
        LOG.info(f"Deleted {len(self.files_removed)} file(s), {sizeof_fmt(self.bytes_removed)} ({elapsed_time:.2f}s)")
        if not self.force:
            LOG.info("This was a dry-run, use -f to perform the operation")

    def remove_all_but_latest(self, dir_path: str, file_names: list[str]) -> None:
        """
        Remove all files from a directory, except for the 'latest' one, by matching
        against the `rev_filename_pat` and comparing CL numbers.

        Files whose name does not match the pattern exactly are logged and left
        untouched. If two files resolve to the same CL number the directory is
        skipped entirely, since the latest revision is then ambiguous.

        Args:
            dir_path: The directory containing the files.
            file_names: Names (not paths) of the files inside `dir_path`.
        """
        # map by CL number
        files_by_cl: dict[int, str] = {}
        for file_name in file_names:
            # fullmatch so that e.g. "1.123.gz.tmp" is not mistaken for a
            # revision file and cannot displace (or be deleted as) a real one
            match = self.rev_filename_pat.fullmatch(file_name)
            if not match:
                LOG.error(f"Unrecognized file name: {file_name} ({dir_path})")
                continue

            cl_number = int(match.group("CL"))
            if cl_number in files_by_cl:
                LOG.error(f"Duplicate CL number: {file_name} ({dir_path})")
                return

            files_by_cl[cl_number] = os.path.join(dir_path, file_name)

        if not files_by_cl:
            return

        max_cl = max(files_by_cl)
        for cl, file_path in files_by_cl.items():
            if cl != max_cl:
                self.remove_file(file_path, f"latest: {max_cl}")

    def remove_file(self, file_path, reason: str | None = None) -> None:
        """
        Delete a file. Does nothing unless `self.force = True`.

        The file is added to `files_removed` / `bytes_removed` only once it has
        been sized and (when forced) deleted successfully. A file that cannot be
        read or deleted is logged and skipped so that one failure does not abort
        the rest of the clean.

        Args:
            file_path: The full path to the file to remove.
            reason: An optional reason to include in verbose logging for why it was removed.
        """
        try:
            file_size = os.path.getsize(file_path)
        except OSError as exc:
            # e.g. the file disappeared between the directory walk and now
            LOG.warning(f"Skipping {file_path}: {exc}")
            return

        if self.verbose:
            reason_str = f" ({reason})" if reason else ""
            LOG.debug(f"Deleting {file_path} ({sizeof_fmt(file_size)}){reason_str}")

        if self.force:
            try:
                os.remove(file_path)
            except OSError as exc:
                LOG.error(f"Failed to delete {file_path}: {exc}")
                return

        self.files_removed.append(file_path)
        self.bytes_removed += file_size
# Command-line entry point. NOTE: documented with comments rather than a
# docstring on purpose -- click would surface a docstring in the --help text.
@click.command()
@click.option("-r", "--cache", required=True, help="The p4proxy cache directory")
@click.option("-h", "--latest-only", is_flag=True, help="Keep only the latest revision of each file")
@click.option("-f", "--force", is_flag=True, help="Actually run the operation, otherwise do a dry-run.")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging.")
def main(cache: str, latest_only=False, force=False, verbose=False):
    # Forward the parsed flags to the cleaner unchanged and run it once.
    flags = {"latest_only": latest_only, "force": force, "verbose": verbose}
    P4PCacheCleaner(cache, **flags).clean()


# Only run the CLI when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment