Created
October 1, 2022 02:45
-
-
Save qxcv/052d652c47c33afd767d01175843fb58 to your computer and use it in GitHub Desktop.
Find and delete large groups of files that are identical except for a single number (preserving the largest-numbered entry)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections | |
import re | |
import os | |
# Matches one maximal run of ASCII digits.
number_re = re.compile(r"([0-9]+)")


def number_split_permutations(filename):
    """Yield every way of splitting `filename` around one run of digits.

    For each maximal run of ASCII digits in `filename`, yields a pair
    `((prefix, suffix), number)`, where `prefix` and `suffix` are the text
    before and after that run and `number` is the run parsed as a base-10
    integer (so leading zeros are ignored). Yields nothing when the name
    contains no digits.
    """
    for digit_run in number_re.finditer(filename):
        lo, hi = digit_run.start(), digit_run.end()
        surrounding = (filename[:lo], filename[hi:])
        yield surrounding, int(digit_run.group(0), base=10)
def build_number_table(filenames):
    """Group filenames that differ from each other only in one embedded number.

    Returns a `defaultdict` mapping each `(prefix, suffix)` key produced by
    `number_split_permutations` to the set of `(number, filename)` pairs
    sharing that key. A filename containing several digit runs appears under
    one key per run.
    """
    grouped = collections.defaultdict(set)
    keyed_entries = (
        (split_key, (number, name))
        for name in filenames
        for split_key, number in number_split_permutations(name)
    )
    for split_key, entry in keyed_entries:
        grouped[split_key].add(entry)
    return grouped
def delete_files(directory, filenames, min_files, dry_run=True):
    """Prune numbered file groups among `filenames` inside `directory`.

    Every group of names that are identical except for one number, and that
    has at least `min_files` members, is reported on stdout; all members
    except the highest-numbered one are marked for deletion. The marked files
    are unlinked only when `dry_run` is false.
    """
    # The verb depends only on the mode, so pick it once up front.
    verb = "Would delete" if dry_run else "Will delete"
    doomed = set()
    for key, entries in build_number_table(filenames).items():
        if len(entries) >= min_files:
            # Highest number first; ties are broken by filename, descending.
            ranked = [name for _, name in sorted(entries, reverse=True)]
            to_keep, *delete_list = ranked
            print(f"{verb} '{'*'.join(key)}' from '{directory}', except {to_keep} (deletes {len(delete_list)} files)")
            doomed.update(delete_list)
    if dry_run:
        return
    # A name may be marked by several groups; the set ensures one unlink each.
    for filename in sorted(doomed):
        os.unlink(os.path.join(directory, filename))
def delete_files_recursive(root_directory, *, min_files=11, dry_run=True):
    """Recursively clean directories with many files of the form `<prefix>_<number>_<suffix>`.

    For `root_directory` itself and every directory below it, this function
    looks for groups of files in the same directory whose names are identical
    except for a single number. For each group with at least `min_files`
    members (with the default of 11, that means more than ten files), all
    files except the one with the highest number are deleted. This is useful
    for cleaning up numbered training snapshots in ML code.

    Each number in a filename is considered independently, so a name with
    several numbers can belong to several groups; it is deleted if any
    qualifying group marks it, even when it is the file kept by another group.

    By default, `dry_run=True` prevents files from actually being deleted and
    only prints what would be removed; set `dry_run=False` to delete files.
    """
    for dirpath, _, filenames in os.walk(root_directory):
        delete_files(directory=dirpath, filenames=filenames, min_files=min_files, dry_run=dry_run)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment