Created
January 16, 2025 01:44
-
-
Save lucianmachado/50ae0771e51d01c8e76dc640d93ba9cc to your computer and use it in GitHub Desktop.
Scans the filesystem for files larger than a user-defined size, stores paths in a report, and includes an optional removal feature using a previously generated report. Allows filtering by file extensions and can simulate or execute file deletions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import sys | |
import re | |
from pathlib import Path | |
import time | |
import argparse | |
from datetime import datetime | |
from collections import deque | |
def convert_size(size_bytes):
    """Converts bytes to a readable format (B, KB, MB, GB, TB, PB).

    Args:
        size_bytes: Size in bytes (int or float).

    Returns:
        The size scaled by powers of 1024, formatted with two decimals and
        followed by its unit, e.g. ``"1.50KB"``. Values of 1024 TB or more
        are expressed in PB.
    """
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if size_bytes < 1024.0:
            return f"{size_bytes:.2f}{unit}"
        size_bytes /= 1024.0
    # The loop fell through, so the value is at least 1024 TB. Returning a
    # string here avoids the implicit ``None`` that callers would otherwise
    # try to format with a width specifier.
    return f"{size_bytes:.2f}PB"
def get_size_in_gb(size_bytes):
    """Express a byte count in GB, where one GB is 1024 ** 3 bytes."""
    bytes_per_gb = 1024 ** 3
    return size_bytes / bytes_per_gb
def parse_extensions(extensions_str):
    """Converts an extension string into a set of normalized extensions.

    Args:
        extensions_str: Comma-separated extensions, with or without a leading
            dot (e.g. ``"iso, .VMDK,vdi"``). May be ``None`` or empty.

    Returns:
        A set of lower-cased extensions, each starting with a single dot
        (e.g. ``{'.iso', '.vmdk', '.vdi'}``). Empty items, such as those
        produced by a doubled or trailing comma, are dropped.
    """
    if not extensions_str:
        return set()
    normalized = set()
    for raw in extensions_str.split(','):
        # Strip *before* looking for the leading dot; otherwise " .mkv" would
        # be treated as dot-less and end up as "..mkv".
        ext = raw.strip().lower()
        if not ext:
            continue
        normalized.add(ext if ext.startswith('.') else f'.{ext}')
    return normalized
def should_process_file(filepath, only_extensions, ignored_extensions):
    """Decide whether a file passes the extension filters.

    A non-empty ``only_extensions`` acts as an allow-list and takes
    precedence; otherwise ``ignored_extensions`` is used as a deny-list.
    The comparison uses the file's lower-cased final suffix.
    """
    suffix = Path(filepath).suffix.lower()
    if not only_extensions:
        return suffix not in ignored_extensions
    return suffix in only_extensions
def update_display(header_info, display_buffer, only_extensions, ignored_extensions):
    """Redraw the screen: search header, running totals, then the buffered files.

    ``header_info`` supplies the ``min_size``, ``total_size`` and
    ``total_files`` values shown in the header; ``display_buffer`` holds
    ``(path, size_in_bytes)`` pairs printed one per line.
    """
    separator = "=" * 80

    # ANSI escape sequences: move the cursor home, then clear the screen.
    print("\033[H\033[J")
    print(separator)
    print(f"Searching for files larger than {header_info['min_size']}GB...")
    if only_extensions:
        shown = ', '.join(sorted(only_extensions))
        print(f"Searching only for extensions: {shown}")
    elif ignored_extensions:
        shown = ', '.join(sorted(ignored_extensions))
        print(f"Ignoring extensions: {shown}")
    print(separator)

    total_gb = get_size_in_gb(header_info['total_size'])
    print(f"\nTotal found: {total_gb:.2f}GB")
    print(f"Files found: {header_info['total_files']}")
    print(separator + "\n")

    # A falsy buffer (empty or None) simply prints no file lines.
    for path, size in display_buffer or ():
        print(f"{path:<70} {convert_size(size):>10}")
def process_removal_report(report_file, filter_pattern=None, ignore_pattern=None, dry_run=True):
    """Processes a report file for file removal.

    Reads one path per line from ``report_file`` (blank lines and lines
    starting with ``#`` are skipped), keeps the paths selected by the optional
    regular expressions, prints a summary, and then either simulates or
    performs the deletion.

    Args:
        report_file: Path to the report file to read.
        filter_pattern: Optional regular expression; only lines it matches
            (``re.search``) are kept.
        ignore_pattern: Optional regular expression; lines it matches
            (``re.search``) are dropped.
        dry_run: When true (the default) nothing is deleted and each file is
            only listed; when false the files are removed with ``os.remove``.

    Returns:
        None. Problems (missing report, invalid pattern, inaccessible or
        undeletable files) are reported on standard output.
    """
    if not os.path.exists(report_file):
        print(f"Error: File {report_file} not found")
        return

    # Validate the user-supplied patterns up front so a typo produces a
    # message instead of a traceback.
    try:
        filter_regex = re.compile(filter_pattern) if filter_pattern else None
        ignore_regex = re.compile(ignore_pattern) if ignore_pattern else None
    except re.error as e:
        print(f"Error: invalid regular expression: {e}")
        return

    total_size = 0
    files_to_remove = []
    with open(report_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            if filter_regex and not filter_regex.search(line):
                continue
            if ignore_regex and ignore_regex.search(line):
                continue
            try:
                # Paths that no longer exist are skipped silently.
                if os.path.exists(line):
                    size = os.path.getsize(line)
                    files_to_remove.append((line, size))
                    total_size += size
            except OSError as e:
                print(f"Error accessing {line}: {e}")

    print("=" * 80)
    print(f"{'SIMULATION' if dry_run else 'REMOVAL'} OF FILES")
    if filter_pattern:
        print(f"Selection filter (filter): {filter_pattern}")
    if ignore_pattern:
        print(f"Exclusion filter (ignore): {ignore_pattern}")
    print("=" * 80)
    print(f"\nTotal to be removed: {convert_size(total_size)}")
    print(f"Files affected: {len(files_to_remove)}")
    print("=" * 80 + "\n")

    failed = 0
    for filepath, size in files_to_remove:
        if dry_run:
            print(f"[SIMULATION] Would remove: {filepath:<70} {convert_size(size):>10}")
            continue
        try:
            print(f"Removing: {filepath:<70} {convert_size(size):>10}")
            os.remove(filepath)
        except OSError as e:
            failed += 1
            print(f"Error removing {filepath}: {e}")

    print("\n" + "=" * 80)
    if dry_run:
        print("Simulation completed! Use --execute to perform actual removal.")
    elif failed:
        # Do not claim success when some deletions did not happen.
        print(f"Removal completed with errors: {failed} of "
              f"{len(files_to_remove)} files could not be removed.")
    else:
        print("Removal completed!")
    print("=" * 80)
def find_large_files(min_size_gb, max_display=30, start_path='/', output_file='large_files.txt',
                     ignore_extensions=None, only_extensions=None):
    """Walks a directory tree and records every file of at least ``min_size_gb`` GB.

    The absolute path of each match is written, one per line, to
    ``output_file`` (preceded by ``#`` comment lines describing the search),
    and the screen is redrawn after every match.

    Args:
        min_size_gb: Minimum file size, in GB (1024 ** 3 bytes), to report.
        max_display: Maximum number of most recent matches kept on screen.
            Negative values are treated as 0; ``None`` keeps every match.
        start_path: Directory where the walk starts.
        output_file: Report file; it is overwritten.
        ignore_extensions: Comma-separated extensions to skip. Not used when
            ``only_extensions`` is given.
        only_extensions: Comma-separated extensions to restrict the search to.

    Returns:
        None. A summary is printed when the walk ends or is interrupted with
        Ctrl+C.
    """
    min_size_bytes = min_size_gb * (1024 ** 3)
    ignored_extensions = parse_extensions(ignore_extensions) if not only_extensions else set()
    only_extensions_set = parse_extensions(only_extensions)
    header_info = {
        'min_size': min_size_gb,
        'total_size': 0,
        'total_files': 0
    }
    # deque rejects a negative maxlen with ValueError, so clamp it.
    buffer_len = None if max_display is None else max(max_display, 0)
    display_buffer = deque(maxlen=buffer_len)

    # The report is opened once and kept open for the whole walk instead of
    # being reopened for every match.
    with open(output_file, 'w') as report:
        report.write(f"# Files larger than {min_size_gb}GB found on {datetime.now()}\n")
        if only_extensions_set:
            report.write(f"# Searching only for extensions: {', '.join(sorted(only_extensions_set))}\n")
        elif ignored_extensions:
            report.write(f"# Ignoring extensions: {', '.join(sorted(ignored_extensions))}\n")
        report.flush()

        update_display(header_info, display_buffer, only_extensions_set, ignored_extensions)
        try:
            for root, _dirs, files in os.walk(start_path):
                for name in files:
                    try:
                        filepath = os.path.join(root, name)
                        if not should_process_file(filepath, only_extensions_set, ignored_extensions):
                            continue
                        size = os.path.getsize(filepath)
                        if size < min_size_bytes:
                            continue
                        header_info['total_size'] += size
                        header_info['total_files'] += 1
                        abs_path = os.path.abspath(filepath)
                        display_buffer.append((abs_path, size))
                        update_display(header_info, display_buffer, only_extensions_set, ignored_extensions)
                        report.write(f"{abs_path}\n")
                        # Flush per match so the report on disk stays current
                        # even if the scan is interrupted.
                        report.flush()
                    except OSError:
                        # Best effort: skip anything that cannot be processed
                        # (unreadable, vanished, ...) and keep scanning.
                        continue
        except KeyboardInterrupt:
            print("\nSearch interrupted by user.")

    print("\n" + "=" * 80)
    print(f"Search finished! {header_info['total_files']} files found.")
    print(f"Total size: {get_size_in_gb(header_info['total_size']):.2f}GB")
    print(f"Complete file list saved in: {output_file}")
    print("=" * 80)
def main():
    """Command-line entry point: parse the arguments and run ``find`` or ``remove``.

    Without a sub-command the help text is printed.
    """
    cli = argparse.ArgumentParser(description='Find and manage large files on the system')
    commands = cli.add_subparsers(dest='command', help='Available commands')

    # "find": scan a directory tree and write a report.
    finder = commands.add_parser('find', help='Search for large files')
    finder.add_argument('min_size', type=float, help='Minimum size in GB')
    finder.add_argument(
        '--max-display', type=int, default=30,
        help='Maximum number of files to display (default: 30)',
    )
    finder.add_argument(
        '--path', type=str, default='/',
        help='Path to start searching',
    )
    finder.add_argument(
        '--output', type=str, default='large_files.txt',
        help='Output file',
    )
    finder.add_argument(
        '--ignore-extensions', type=str,
        help='List of extensions to ignore, separated by comma (e.g. iso,vmdk,vdi)',
    )
    finder.add_argument(
        '--only-extensions', type=str,
        help='List of extensions to search for, separated by comma (e.g. mp4,mkv,avi)',
    )

    # "remove": act on a report produced by an earlier "find".
    remover = commands.add_parser('remove', help='Remove files from a previous report')
    remover.add_argument('report', type=str, help='Previously generated report file')
    remover.add_argument('--filter', type=str, help='Regular expression to select files')
    remover.add_argument('--ignore', type=str, help='Regular expression to ignore files')
    remover.add_argument(
        '--execute', action='store_true',
        help='Execute removal (without this, runs in simulation mode)',
    )

    options = cli.parse_args()

    if options.command == 'find':
        find_large_files(
            options.min_size,
            max_display=options.max_display,
            start_path=options.path,
            output_file=options.output,
            ignore_extensions=options.ignore_extensions,
            only_extensions=options.only_extensions,
        )
        return

    if options.command == 'remove':
        # Simulation is the default; only --execute turns dry-run off.
        process_removal_report(
            options.report,
            filter_pattern=options.filter,
            ignore_pattern=options.ignore,
            dry_run=not options.execute,
        )
        return

    cli.print_help()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment