@basperheim
Created May 27, 2025 22:22
Python script that finds large files in a directory
"""Find large files under a directory tree.

Usage: python3 find_large_files.py ~/Movies
"""
import os
import argparse
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Directories to ignore while scanning
IGNORE_DIRS = {
    'node_modules', 'vendor', 'lib', 'compiled', '__pycache__',
    'dist', 'build', '.git', '.venv', '.next', '.nuxt', '.cache'
}

# Only include files larger than this (in bytes)
SIZE_THRESHOLD = 1e6  # 1 MB (10**6 bytes)
PROGRESS_INTERVAL = 500  # How often to log progress (every N files)


def is_ignored_dir(path):
    """Return True if any component of path matches an ignored directory name."""
    parts = set(os.path.normpath(path).split(os.sep))
    return any(ignored in parts for ignored in IGNORE_DIRS)


def get_file_size(path):
    """Return the size of the file at path in bytes, or None if it cannot be read."""
    try:
        return os.path.getsize(path)
    except OSError as e:
        logging.warning(f"Could not get size for {path}: {e}")
        return None


def find_large_files(root_dir):
    """Walk root_dir and return files of at least SIZE_THRESHOLD bytes, sorted by size ascending."""
    large_files = []
    total_files = 0
    processed_files = 0

    logging.info(f"Scanning {root_dir} for large files...")

    for root, dirs, files in os.walk(root_dir):
        # Prune ignored directories in place so os.walk never descends into them
        dirs[:] = [d for d in dirs if d not in IGNORE_DIRS]
        total_files += len(files)

        for file in files:
            filepath = os.path.join(root, file)
            processed_files += 1

            if processed_files % PROGRESS_INTERVAL == 0:
                logging.info(f"Processed {processed_files} files...")

            # Also skip files whose path contains an ignored component
            # (covers the case where root_dir itself lives inside one)
            if is_ignored_dir(filepath):
                continue

            size = get_file_size(filepath)
            if size is None or size < SIZE_THRESHOLD:
                continue

            large_files.append({
                "filename": filepath,
                "size": size
            })

    logging.info(f"Finished processing {processed_files} files (of {total_files} detected).")
    return sorted(large_files, key=lambda f: f['size'])


def main():
    parser = argparse.ArgumentParser(description="Find large files in a directory.")
    parser.add_argument("directory", help="Absolute path to the directory to scan")
    args = parser.parse_args()
    directory = args.directory

    if not os.path.isabs(directory):
        parser.error("Please provide an absolute path to the directory")
    if not os.path.exists(directory):
        parser.error("Directory does not exist")
    if not os.path.isdir(directory):
        parser.error("Path is not a directory")

    large_files = find_large_files(directory)

    if large_files:
        logging.info("Large files found (sorted by size ascending):")
        for f in large_files:
            size_mb = f['size'] / (1024 * 1024)
            print(f"{size_mb:8.2f} MB - {f['filename']}")
    else:
        logging.info("No files larger than 1MB were found.")


if __name__ == "__main__":
    main()
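
Besides the command-line entry point, the scanner can also be driven programmatically. A minimal sketch, assuming the script above is saved as find_large_files.py and is importable from the working directory:

import os
from find_large_files import find_large_files

# Scan a directory and summarize how much space the large files take up
matches = find_large_files(os.path.expanduser("~/Movies"))
total_mb = sum(f["size"] for f in matches) / (1024 * 1024)
print(f"{len(matches)} files over the threshold, {total_mb:.2f} MB total")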