Skip to content

Instantly share code, notes, and snippets.

@bohdon
Last active October 16, 2024 14:56
Show Gist options
  • Save bohdon/c51c7a7073659d45679427da098eb4f1 to your computer and use it in GitHub Desktop.
Save bohdon/c51c7a7073659d45679427da098eb4f1 to your computer and use it in GitHub Desktop.
Clean a p4proxy cache to remove unnecessary files and free up disk space.
from __future__ import annotations
import logging
import os
import re
import time
import click
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)
# Configure root logging at DEBUG level; the level name is right-aligned in an
# 8-character field inside square brackets, followed by the message.
logging.basicConfig(format="[%(levelname)8s] %(message)s", level=logging.DEBUG)
def sizeof_fmt(num, suffix="B"):
    """
    Format a quantity as a human-readable string using decimal (1000-based) prefixes.

    Args:
        num: The number to format.
        suffix: Text appended after the unit prefix.

    Returns:
        The value scaled down by factors of 1000 until its magnitude is below 1000,
        rendered with one decimal place, e.g. ``1.5MB``. Values that are still too
        large after the ``Z`` prefix are reported with a ``Y`` prefix.
    """
    prefixes = ("", "K", "M", "G", "T", "P", "E", "Z")
    scaled = num
    for prefix in prefixes:
        if not abs(scaled) < 1000.0:
            # still too large for this prefix, move on to the next one
            scaled = scaled / 1000.0
            continue
        return f"{scaled:3.1f}{prefix}{suffix}"
    return f"{scaled:.1f}Y{suffix}"
class P4PCacheCleaner(object):
"""
Util for cleaning a p4proxy cache directory.
"""
def __init__(self, cache_root: str, latest_only=False, force=False, verbose=False):
self.cache_root = cache_root
self.latest_only = latest_only
self.force = force
self.verbose = verbose
# regex pattern for revision file names, e.g. 1.12345.gz
self.rev_filename_pat = re.compile("1\.(?P<CL>\d+)\.gz")
# track how many total bytes were deleted
self.files_removed = []
self.bytes_removed = 0
def validate_options(self):
# make sure at least one cleaning method is enabled (currently only 1)
if not self.latest_only:
LOG.warning("No cleaning options were enabled, check --help")
return False
return True
def clean(self):
"""
Perform the clean
"""
start_time = time.time()
if not os.path.isdir(self.cache_root):
LOG.error(f"Cache directory not found: {self.cache_root}")
return
if not self.validate_options():
return
LOG.info(f"Cleaning {self.cache_root}...")
for dir_path, dir_names, file_names in os.walk(self.cache_root):
if dir_path.endswith(",d"):
if dir_names:
LOG.error(f"Unexpected subdirectory in {dir_path}")
if self.latest_only and len(file_names) > 1:
self.remove_all_but_latest(dir_path, file_names)
elapsed_time = time.time() - start_time
LOG.info(f"Deleted {len(self.files_removed)} file(s), {sizeof_fmt(self.bytes_removed)} ({elapsed_time:.2f}s)")
if not self.force:
LOG.info("This was a dry-run, use -f to perform the operation")
def remove_all_but_latest(self, dir_path: str, file_names: list[str]):
"""
Remove all files from a directory, except for the 'latest' one, by matching
against the `rev_filename_pat` and comparing CL numbers.
"""
# map by CL number
files_by_cl = {}
for file_name in file_names:
match = self.rev_filename_pat.match(file_name)
if not match:
LOG.error(f"Unrecognized file name: {file_name} ({dir_path})")
continue
cl_number = int(match.groupdict()["CL"])
if cl_number in files_by_cl:
LOG.error(f"Duplicate CL number: {file_name} ({dir_path})")
return
file_path = os.path.join(dir_path, file_name)
files_by_cl[cl_number] = file_path
if not files_by_cl:
return
max_cl = max(files_by_cl.keys())
for cl, file_path in files_by_cl.items():
if cl != max_cl:
self.remove_file(file_path, f"latest: {max_cl}")
def remove_file(self, file_path, reason: str = None):
"""
Delete a file. Does nothing unless `self.force = True`.
Args:
file_path: The full path to the file to remove.
reason: An optional reason to include in verbose logging for why it was removed.
"""
self.files_removed.append(file_path)
file_size = os.path.getsize(file_path)
self.bytes_removed += file_size
if self.verbose:
reason_str = f" ({reason})" if reason else ""
LOG.debug(f"Deleting {file_path} ({sizeof_fmt(file_size)}){reason_str}")
if self.force:
os.remove(file_path)
# Command-line entry point: builds a cleaner from the parsed options and runs it.
# (Kept as comments rather than a docstring so the generated --help text is unchanged.)
@click.command()
@click.option("-r", "--cache", required=True, help="The p4proxy cache directory")
@click.option("-h", "--latest-only", is_flag=True, help="Keep only the latest revision of each file")
@click.option("-f", "--force", is_flag=True, help="Actually run the operation, otherwise do a dry-run.")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging.")
def main(cache: str, latest_only=False, force=False, verbose=False):
    # cache       -> root directory handed to the cleaner
    # latest_only -> keep only the newest revision in each ",d" directory
    # force       -> really delete files instead of doing a dry-run
    # verbose     -> log each file selected for removal
    P4PCacheCleaner(
        cache_root=cache,
        latest_only=latest_only,
        force=force,
        verbose=verbose,
    ).clean()
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment