Skip to content

Instantly share code, notes, and snippets.

@lucianmachado
Created January 16, 2025 01:44
Show Gist options
  • Save lucianmachado/50ae0771e51d01c8e76dc640d93ba9cc to your computer and use it in GitHub Desktop.
Save lucianmachado/50ae0771e51d01c8e76dc640d93ba9cc to your computer and use it in GitHub Desktop.
Scans the filesystem for files larger than a user-defined size, stores paths in a report, and includes an optional removal feature using a previously generated report. Allows filtering by file extensions and can simulate or execute file deletions.
#!/usr/bin/env python3
import os
import sys
import re
from pathlib import Path
import time
import argparse
from datetime import datetime
from collections import deque
def convert_size(size_bytes):
    """Convert a byte count to a human-readable string.

    The value is repeatedly divided by 1024 until it drops below 1024 and is
    then formatted with two decimals followed by the matching unit
    (B, KB, MB, GB, TB). Values of 1024 TB or more are expressed in PB so the
    function always returns a string instead of falling through to ``None``.

    Args:
        size_bytes: Size in bytes (int or float).

    Returns:
        A string such as ``"1.50KB"`` or ``"3.00PB"``.
    """
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.2f}{unit}"
        size_bytes /= 1024.0
    # The loop divided by 1024 once per listed unit, so what is left is in PB.
    return f"{size_bytes:.2f}PB"
def get_size_in_gb(size_bytes):
    """Return *size_bytes* expressed in GB, where 1 GB is 1024**3 bytes."""
    bytes_per_gb = 1024 ** 3
    return size_bytes / bytes_per_gb
def parse_extensions(extensions_str):
    """Convert a comma-separated extension string into a set of normalized extensions.

    Every entry is stripped of surrounding whitespace, lower-cased and given a
    leading dot when it does not already have one, e.g. ``"ISO, .vmdk"``
    becomes ``{".iso", ".vmdk"}``. Empty entries (from a trailing or doubled
    comma) are dropped.

    Args:
        extensions_str: Comma-separated extensions, or a falsy value
            (``None`` / ``""``) for "no extensions".

    Returns:
        A set of extensions, each starting with a dot.
    """
    if not extensions_str:
        return set()

    normalized = set()
    for raw in extensions_str.split(','):
        # Strip BEFORE testing for the dot; otherwise " .vmdk" is treated as
        # dot-less and ends up as "..vmdk".
        ext = raw.strip().lower()
        if not ext:
            continue
        normalized.add(ext if ext.startswith('.') else f'.{ext}')
    return normalized
def should_process_file(filepath, only_extensions, ignored_extensions):
    """Decide whether *filepath* passes the extension filters.

    The file's suffix is compared case-insensitively (it is lower-cased
    here). A non-empty ``only_extensions`` acts as a whitelist and takes
    precedence; otherwise the file passes unless its suffix is listed in
    ``ignored_extensions``.
    """
    suffix = Path(filepath).suffix.lower()
    if not only_extensions:
        return suffix not in ignored_extensions
    return suffix in only_extensions
def update_display(header_info, display_buffer, only_extensions, ignored_extensions):
    """Redraw the screen: a summary header followed by the buffered files.

    ``header_info`` supplies the ``min_size``, ``total_size`` and
    ``total_files`` values shown in the header; ``display_buffer`` holds
    ``(path, size_in_bytes)`` pairs printed one per line below it.
    """
    separator = "=" * 80

    # Escape sequences written before the header on every redraw.
    print("\033[H\033[J")
    print(separator)
    print(f"Searching for files larger than {header_info['min_size']}GB...")

    # Only one filter line is shown; the "only" set wins over the ignore set.
    if only_extensions:
        print(f"Searching only for extensions: {', '.join(sorted(only_extensions))}")
    elif ignored_extensions:
        print(f"Ignoring extensions: {', '.join(sorted(ignored_extensions))}")

    print(separator)
    total_gb = get_size_in_gb(header_info['total_size'])
    print(f"\nTotal found: {total_gb:.2f}GB")
    print(f"Files found: {header_info['total_files']}")
    print(separator + "\n")

    # An empty (or missing) buffer simply prints nothing.
    for path, size in display_buffer or ():
        print(f"{path:<70} {convert_size(size):>10}")
def process_removal_report(report_file, filter_pattern=None, ignore_pattern=None, dry_run=True):
    """Remove, or simulate removing, the files listed in a report file.

    The report is read line by line; blank lines and lines starting with
    ``#`` are skipped. Each remaining line is treated as a file path.

    Args:
        report_file: Path of the report to read.
        filter_pattern: Optional regular expression; only paths it matches
            (via ``search``) are selected.
        ignore_pattern: Optional regular expression; paths it matches
            (via ``search``) are excluded.
        dry_run: When true (the default) nothing is deleted and each file is
            only listed; when false the files are deleted with ``os.remove``.

    Returns:
        None. All results and errors are reported on standard output.
    """
    if not os.path.exists(report_file):
        print(f"Error: File {report_file} not found")
        return

    # Compile both patterns up front so a typo in a user-supplied expression
    # is reported as an ordinary error instead of an uncaught traceback.
    try:
        filter_regex = re.compile(filter_pattern) if filter_pattern else None
        ignore_regex = re.compile(ignore_pattern) if ignore_pattern else None
    except re.error as e:
        print(f"Error: invalid regular expression: {e}")
        return

    total_size = 0
    files_to_remove = []
    with open(report_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            if filter_regex and not filter_regex.search(line):
                continue
            if ignore_regex and ignore_regex.search(line):
                continue
            # Ask for the size directly rather than pre-checking with
            # os.path.exists(): that check hides stat failures, which would
            # make the error report below unreachable.
            try:
                size = os.path.getsize(line)
            except (FileNotFoundError, NotADirectoryError):
                # The path no longer exists; nothing to remove.
                continue
            except OSError as e:
                print(f"Error accessing {line}: {e}")
                continue
            files_to_remove.append((line, size))
            total_size += size

    print("=" * 80)
    print(f"{'SIMULATION' if dry_run else 'REMOVAL'} OF FILES")
    if filter_pattern:
        print(f"Selection filter (filter): {filter_pattern}")
    if ignore_pattern:
        print(f"Exclusion filter (ignore): {ignore_pattern}")
    print("=" * 80)
    print(f"\nTotal to be removed: {convert_size(total_size)}")
    print(f"Files affected: {len(files_to_remove)}")
    print("=" * 80 + "\n")

    failed = 0
    for filepath, size in files_to_remove:
        if dry_run:
            print(f"[SIMULATION] Would remove: {filepath:<70} {convert_size(size):>10}")
            continue
        print(f"Removing: {filepath:<70} {convert_size(size):>10}")
        try:
            os.remove(filepath)
        except OSError as e:
            failed += 1
            print(f"Error removing {filepath}: {e}")

    print("\n" + "=" * 80)
    if dry_run:
        print("Simulation completed! Use --execute to perform actual removal.")
    elif failed:
        # Do not claim success when some deletions did not happen.
        print(f"Removal completed with {failed} error(s).")
    else:
        print("Removal completed!")
    print("=" * 80)
def find_large_files(min_size_gb, max_display=30, start_path='/', output_file='large_files.txt',
                     ignore_extensions=None, only_extensions=None):
    """Walk *start_path* and record every file of at least *min_size_gb* GB.

    Each match is appended to ``output_file`` as an absolute path (one per
    line, after ``#``-prefixed header lines) and the on-screen summary is
    redrawn. Files whose size cannot be read are skipped. Pressing Ctrl+C
    stops the walk early; the summary is still printed.

    Args:
        min_size_gb: Minimum size in GB (1 GB = 1024**3 bytes).
        max_display: Maximum number of most-recent matches kept on screen.
        start_path: Directory where the walk starts.
        output_file: Report file to (over)write.
        ignore_extensions: Comma-separated extensions to skip; not used when
            ``only_extensions`` is given.
        only_extensions: Comma-separated extensions to restrict the search to.

    Returns:
        None.
    """
    min_size_bytes = min_size_gb * (1024 ** 3)
    only_extensions_set = parse_extensions(only_extensions)
    # A whitelist makes the ignore list irrelevant, so it is dropped entirely.
    ignored_extensions = set() if only_extensions else parse_extensions(ignore_extensions)

    header_info = {
        'min_size': min_size_gb,
        'total_size': 0,
        'total_files': 0
    }
    # Bounded deque: only the newest `max_display` matches stay on screen.
    display_buffer = deque(maxlen=max_display)

    # Keep one handle open for the whole scan instead of reopening the report
    # for every match; each entry is flushed so an interrupted run still
    # leaves a usable report on disk.
    with open(output_file, 'w') as report:
        report.write(f"# Files larger than {min_size_gb}GB found on {datetime.now()}\n")
        if only_extensions_set:
            report.write(f"# Searching only for extensions: {', '.join(sorted(only_extensions_set))}\n")
        elif ignored_extensions:
            report.write(f"# Ignoring extensions: {', '.join(sorted(ignored_extensions))}\n")
        report.flush()

        update_display(header_info, display_buffer, only_extensions_set, ignored_extensions)

        try:
            for root, _dirs, files in os.walk(start_path):
                for name in files:
                    filepath = os.path.join(root, name)
                    if not should_process_file(filepath, only_extensions_set, ignored_extensions):
                        continue
                    try:
                        size = os.path.getsize(filepath)
                        if size < min_size_bytes:
                            continue
                        header_info['total_size'] += size
                        header_info['total_files'] += 1
                        abs_path = os.path.abspath(filepath)
                        display_buffer.append((abs_path, size))
                        update_display(header_info, display_buffer, only_extensions_set, ignored_extensions)
                        report.write(f"{abs_path}\n")
                        report.flush()
                    except OSError:
                        # Best effort: skip this file on any OS-level error
                        # and keep scanning.
                        continue
        except KeyboardInterrupt:
            print("\nSearch interrupted by user.")

    print("\n" + "=" * 80)
    print(f"Search finished! {header_info['total_files']} files found.")
    print(f"Total size: {get_size_in_gb(header_info['total_size']):.2f}GB")
    print(f"Complete file list saved in: {output_file}")
    print("=" * 80)
def _build_parser():
    """Create the argument parser with its ``find`` and ``remove`` sub-commands."""
    parser = argparse.ArgumentParser(description='Find and manage large files on the system')
    commands = parser.add_subparsers(dest='command', help='Available commands')

    # "find": scan the filesystem and write a report.
    find_cmd = commands.add_parser('find', help='Search for large files')
    find_cmd.add_argument('min_size', type=float, help='Minimum size in GB')
    find_cmd.add_argument('--max-display', type=int, default=30,
                          help='Maximum number of files to display (default: 30)')
    find_cmd.add_argument('--path', type=str, default='/',
                          help='Path to start searching')
    find_cmd.add_argument('--output', type=str, default='large_files.txt',
                          help='Output file')
    find_cmd.add_argument('--ignore-extensions', type=str,
                          help='List of extensions to ignore, separated by comma (e.g. iso,vmdk,vdi)')
    find_cmd.add_argument('--only-extensions', type=str,
                          help='List of extensions to search for, separated by comma (e.g. mp4,mkv,avi)')

    # "remove": delete (or simulate deleting) files from an earlier report.
    remove_cmd = commands.add_parser('remove', help='Remove files from a previous report')
    remove_cmd.add_argument('report', type=str, help='Previously generated report file')
    remove_cmd.add_argument('--filter', type=str, help='Regular expression to select files')
    remove_cmd.add_argument('--ignore', type=str, help='Regular expression to ignore files')
    remove_cmd.add_argument('--execute', action='store_true',
                            help='Execute removal (without this, runs in simulation mode)')
    return parser


def main():
    """Parse the command line and run the selected sub-command.

    Without a sub-command the help text is printed instead.
    """
    parser = _build_parser()
    options = parser.parse_args()

    if options.command == 'find':
        find_large_files(
            min_size_gb=options.min_size,
            max_display=options.max_display,
            start_path=options.path,
            output_file=options.output,
            ignore_extensions=options.ignore_extensions,
            only_extensions=options.only_extensions,
        )
        return

    if options.command == 'remove':
        # Removal is a simulation unless --execute was given.
        simulate = not options.execute
        process_removal_report(options.report, options.filter, options.ignore, simulate)
        return

    parser.print_help()


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment