Skip to content

Instantly share code, notes, and snippets.

@nordinrahman
Last active May 5, 2025 04:58
Show Gist options
  • Save nordinrahman/fd4155749ed9688f943e7fccd3388bc6 to your computer and use it in GitHub Desktop.
Save nordinrahman/fd4155749ed9688f943e7fccd3388bc6 to your computer and use it in GitHub Desktop.
This is a script to clean up a folder and its subdirectories so that they can be compressed.
#!/bin/bash
# Absolute location of this script; the recursive invocations re-run it by path.
SCRIPT_PATH="$(realpath "$0")"
# Option defaults; the argument parser may override them.
TARGET_DIR=
DRY_RUN=false
# SILENT suppresses the startup/summary banner in recursive calls.
SILENT=false
# Honour MAX_JOBS from the environment; otherwise use the number of online
# logical processors, and fall back to 4 when getconf cannot report it.
if [ -z "$MAX_JOBS" ]; then
  MAX_JOBS=$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)
fi
#######################################
# Parse the command line into the option globals.
#   --dry-run      sets DRY_RUN=true
#   --max-jobs N   sets MAX_JOBS=N (N must be a non-negative integer)
#   --silent       sets SILENT=true
#   anything else  the first such argument becomes TARGET_DIR, later ones
#                  are ignored
# Globals:   DRY_RUN, MAX_JOBS, SILENT, TARGET_DIR (written)
# Arguments: the script's command-line arguments
# Exits:     1 when --max-jobs has a missing or non-numeric value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case $1 in
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      --max-jobs)
        # A trailing "--max-jobs" used to make "shift 2" fail without consuming
        # anything, so the loop never terminated. Validate before shifting.
        if [[ $# -lt 2 || ! $2 =~ ^[0-9]+$ ]]; then
          echo "Error: --max-jobs requires a numeric argument" >&2
          exit 1
        fi
        MAX_JOBS="$2"
        shift 2
        ;;
      --silent)
        SILENT=true
        shift
        ;;
      *)
        # Only the first positional argument is taken as the target.
        if [ -z "$TARGET_DIR" ]; then
          TARGET_DIR="$1"
        fi
        shift
        ;;
    esac
  done
}
parse_args "$@"
# A target directory is mandatory: print the usage hint and stop without one.
[ -n "$TARGET_DIR" ] || {
  echo "Error: Please provide a directory path"
  echo "Usage: $0 /path/to/directory [--dry-run] [--max-jobs N] [--silent]"
  exit 1
}
# Canonicalise the path, then insist that it names an existing directory.
TARGET_DIR="$(realpath "$TARGET_DIR")"
[ -d "$TARGET_DIR" ] || {
  echo "Error: Resolved path does not exist or is not a directory: $TARGET_DIR"
  exit 1
}
# ANSI colour escape sequences used in status messages.
NC="\033[0m"
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
# Record whether the dotnet CLI is on PATH: 0 when found, the non-zero status
# of `command -v` otherwise.
DOTNET_AVAILABLE=0
command -v dotnet >/dev/null 2>&1 || DOTNET_AVAILABLE=$?
# Terminal directories (build artifacts, caches, and IDE folders)
TERMINAL_DIRS="node_modules bin obj packages target TestResults __pycache__ build dist"
TERMINAL_DIRS+=" .vs .npm .yarn .pytest_cache venv .venv .mvn"
TERMINAL_DIRS+=" .vscode .idea .eclipse .settings .cache"
# Terminal files (commonly ignored files, including .NET user-specific ones)
TERMINAL_FILES="*.user *.suo *.csproj.user *.sln.cache *.dbmdl *.pyc *.pyo *.cache"
TERMINAL_FILES+=" project.lock.json npm-debug.log yarn-error.log .coverage coverage.xml"
TERMINAL_FILES+=" *.egg *.log dependency-reduced-pom.xml .project .classpath"
#######################################
# Run this script on every submodule reported by `git submodule status`.
# Expects the repository root to be the current working directory and does
# nothing when there is no .gitmodules file.
# Globals: MAX_JOBS, SCRIPT_PATH, DRY_RUN_FLAG (read)
#######################################
_process_submodules() {
  [ -f ".gitmodules" ] || return 0
  echo "Found submodules, processing in parallel..."
  # Each status line is "<flag><sha> <path>[ (<describe>)]". Strip the prefix
  # and the optional suffix instead of taking the second whitespace field, so
  # paths containing spaces survive, then hand them to xargs NUL-delimited.
  # --recursive is deliberately not used: every child invocation walks its own
  # target, so listing nested submodules here would process them twice and
  # concurrently.
  git submodule status 2>/dev/null \
    | sed -E 's/^.[0-9a-f]+ //; s/ \([^ ]*\)$//' \
    | tr '\n' '\0' \
    | xargs -0 -P "$MAX_JOBS" -I {} \
      bash "$SCRIPT_PATH" "{}" ${DRY_RUN_FLAG:+"$DRY_RUN_FLAG"} --silent
}

#######################################
# Prepare a Git working tree for backup: stash pending work under the message
# "force-stash-before-backup", clean untracked/ignored files and delete the
# tracked files from disk, leaving the repository metadata in place.
# Globals:   DRY_RUN, MAX_JOBS, SCRIPT_PATH, DRY_RUN_FLAG, RED, GREEN, YELLOW,
#            NC (read)
# Arguments: $1 - repository root
# Outputs:   progress messages on stdout
# Returns:   non-zero when the directory cannot be entered or when staging or
#            stashing fails (nothing is deleted in that case)
# Note:      changes the current working directory to $1.
#######################################
process_git_repo() {
  local dir="$1"
  echo -e "${GREEN}Processing Git repository at: $dir${NC}"
  cd "$dir" || return

  local git_status stash_message current_head stash_base line file

  # Check if the repository has a valid HEAD (i.e., at least one commit)
  if ! git rev-parse --verify HEAD >/dev/null 2>&1; then
    echo -e "${YELLOW}Warning: No valid HEAD found in repository (possibly empty or uninitialized). Treating as empty repository.${NC}"
    # Clean untracked files if any, without stashing
    git_status=$(git status --porcelain 2>/dev/null)
    if [ -n "$git_status" ]; then
      if [ "$DRY_RUN" = true ]; then
        echo -e "${YELLOW}[DRY-RUN] Would clean untracked files with: git clean -x -d -f${NC}"
      else
        git clean -x -d -f >/dev/null 2>&1
        echo "Cleaned untracked files (no stash created as repository is empty)"
      fi
    fi
    _process_submodules
    return
  fi

  # Submodules have to be handled before the tracked files are removed: the
  # removal below deletes .gitmodules and the submodule directories themselves.
  _process_submodules

  # Read the index NUL-delimited so that paths with spaces, quotes or
  # non-ASCII characters are kept intact.
  local -a tracked_files=()
  while IFS= read -r -d '' file; do
    tracked_files+=("$file")
  done < <(git ls-files -z)

  if [ "${#tracked_files[@]}" -gt 0 ]; then
    # Check if any tracked files physically exist (dangling symlinks count)
    local tracked_files_exist=false
    for file in "${tracked_files[@]}"; do
      if [ -e "$file" ] || [ -L "$file" ]; then
        tracked_files_exist=true
        break
      fi
    done

    if [ "$tracked_files_exist" = true ]; then
      # Look for a stash left behind by a prior run
      stash_message=$(git stash list --format="%gs" | head -n 1)
      current_head=$(git rev-parse HEAD 2>/dev/null || echo "")
      stash_base=$(git rev-parse "stash@{0}^1" 2>/dev/null || echo "")
      git_status=$(git status --porcelain)

      # Check if all changes are deletions.
      # NOTE(review): "D " only matches deletions recorded in the index column;
      # working-tree deletions are reported as " D" - confirm this is intended.
      local all_deletions=true
      if [ -n "$git_status" ]; then
        while IFS= read -r line; do
          if [[ $line != "D "* ]]; then
            all_deletions=false
            break
          fi
        done <<< "$git_status"
      fi

      # Skip stashing if stash exists, matches HEAD, and all changes are deletions
      if [[ $stash_message == *force-stash-before-backup* ]] \
        && [ "$stash_base" = "$current_head" ] \
        && [ "$all_deletions" = true ]; then
        echo "Existing stash 'force-stash-before-backup' found at index 0, based on current HEAD with all tracked files pending deletion. Skipping stash creation."
      else
        echo "Tracked files exist physically or non-deletion changes detected, processing for backup..."
        if [ -n "$git_status" ]; then
          echo "Found pending changes or untracked files, staging and stashing them..."
          if [ "$DRY_RUN" = true ]; then
            echo -e "${YELLOW}[DRY-RUN] Would stage all changes with: git add .${NC}"
            echo -e "${YELLOW}[DRY-RUN] Would stash changes with: git stash push -m 'force-stash-before-backup' --include-untracked${NC}"
            echo -e "${YELLOW}[DRY-RUN] Would clean with: git clean -x -d -f${NC}"
          else
            # The clean and delete steps below are destructive, so stop here
            # if the pending work could not be saved.
            if ! git add . \
              || ! git stash push -m "force-stash-before-backup" --include-untracked; then
              echo -e "${RED}Error: Failed to stash changes in $dir; working tree left untouched.${NC}" >&2
              return 1
            fi
            git clean -x -d -f
            echo "Stashed changes and cleaned untracked/ignored files"
          fi
        fi

        # Remove tracked files
        echo "Removing tracked files..."
        if [ "$DRY_RUN" = true ]; then
          echo -e "${YELLOW}[DRY-RUN] Would remove tracked files${NC}"
        else
          printf '%s\0' "${tracked_files[@]}" \
            | xargs -0 -P "$MAX_JOBS" -n 100 rm -rf --
          echo "Removed tracked files"
        fi
      fi
    else
      echo "All tracked files are physically deleted; repository is already prepared for backup."
      # Check for an existing stash to confirm
      stash_message=$(git stash list --format="%gs" | head -n 1)
      current_head=$(git rev-parse HEAD 2>/dev/null || echo "")
      stash_base=$(git rev-parse "stash@{0}^1" 2>/dev/null || echo "")
      if [[ $stash_message == *force-stash-before-backup* ]] \
        && [ "$stash_base" = "$current_head" ]; then
        echo "Confirmed existing stash 'force-stash-before-backup' at index 0 matches current HEAD."
      else
        echo "Note: No valid stash found for current HEAD, but all tracked files are deleted."
      fi
      # Clean untracked files if any, without stashing
      git_status=$(git status --porcelain)
      if [ -n "$git_status" ]; then
        if [ "$DRY_RUN" = true ]; then
          echo -e "${YELLOW}[DRY-RUN] Would clean untracked files with: git clean -x -d -f${NC}"
        else
          git clean -x -d -f
          echo "Cleaned untracked files (no stash created as all tracked files are deleted)"
        fi
      fi
    fi
  else
    echo "No tracked files exist in index; repository is already prepared for backup."
    # Clean untracked files if any, without stashing
    git_status=$(git status --porcelain)
    if [ -n "$git_status" ]; then
      if [ "$DRY_RUN" = true ]; then
        echo -e "${YELLOW}[DRY-RUN] Would clean untracked files with: git clean -x -d -f${NC}"
      else
        git clean -x -d -f
        echo "Cleaned untracked files (no stash created as no tracked files present)"
      fi
    fi
  fi
}
#######################################
# Remove .NET build output below a project directory.
# Acts only when the directory itself holds at least one *.csproj, *.fsproj
# or *.vbproj file: runs `dotnet clean` when the CLI is available, then
# removes every bin/obj/TestResults directory underneath and a top-level
# "packages" folder.
# Globals:   DRY_RUN, DOTNET_AVAILABLE, MAX_JOBS, YELLOW, NC (read)
# Arguments: $1 - directory to clean
# Outputs:   progress messages on stdout
# Note:      changes the current working directory to $1.
#######################################
cleanup_dotnet() {
  local dir="$1"
  echo -e "${YELLOW}Processing .NET project at: $dir${NC}"
  cd "$dir" || return

  # `ls a b c` fails as soon as one glob has no match, which skipped every
  # project that did not contain all three project types. Test each glob
  # result on its own; an unmatched glob stays literal and fails -e.
  local project_file has_project=false
  for project_file in *.csproj *.fsproj *.vbproj; do
    if [ -e "$project_file" ]; then
      has_project=true
      break
    fi
  done
  [ "$has_project" = true ] || return 0

  if [ "$DOTNET_AVAILABLE" -eq 0 ]; then
    if [ "$DRY_RUN" = true ]; then
      echo -e "${YELLOW}[DRY-RUN] Would run: dotnet clean${NC}"
    else
      dotnet clean
      echo "Ran dotnet clean"
    fi
  fi

  # -prune stops find from descending into a directory that is about to be
  # removed as a whole.
  local artifact
  if [ "$DRY_RUN" = true ]; then
    while IFS= read -r -d '' artifact; do
      echo -e "${YELLOW}[DRY-RUN] Would remove .NET artifact: $artifact${NC}"
    done < <(find . -type d \( -name bin -o -name obj -o -name TestResults \) -prune -print0)
    if [ -d "packages" ]; then
      echo -e "${YELLOW}[DRY-RUN] Would remove NuGet packages folder${NC}"
    fi
  else
    find . -type d \( -name bin -o -name obj -o -name TestResults \) -prune -print0 \
      | xargs -0 -P "$MAX_JOBS" rm -rf --
    if [ -d "packages" ]; then
      rm -rf "packages" && echo "Removed NuGet packages folder"
    fi
  fi
  return 0
}
#######################################
# Clean a directory that is not a Git repository root: remove the terminal
# directories and terminal files found directly inside it, run the .NET
# cleanup and delete Python *.egg-info folders below it.
# When a .gitignore is present, "!<terminal_dir>/<path>" lines are treated as
# exceptions and the named paths survive the removal of <terminal_dir>.
# Globals:   TERMINAL_DIRS, TERMINAL_FILES, DRY_RUN, MAX_JOBS, YELLOW, NC (read)
# Arguments: $1 - directory to clean
# Outputs:   progress messages on stdout, preservation errors on stderr
# Note:      changes the current working directory to $1.
#######################################
cleanup_non_git() {
  local dir="$1"
  echo -e "${YELLOW}Processing non-Git directory at: $dir${NC}"
  cd "$dir" || return

  local terminal_dir exception line pattern path preserve_dir preserve_ok
  local -a terminal_dirs=() terminal_patterns=() exceptions=() name_tests=()
  # read -a splits the lists into words without glob-expanding patterns such
  # as "*.log" against the current directory.
  read -r -a terminal_dirs <<< "$TERMINAL_DIRS"
  read -r -a terminal_patterns <<< "$TERMINAL_FILES"

  if [ -f ".gitignore" ]; then
    # Handle terminal directories with .gitignore exceptions
    for terminal_dir in "${terminal_dirs[@]}"; do
      [ -d "$terminal_dir" ] || continue

      # Collect the "!<terminal_dir>/<path>" lines, one exception per line.
      exceptions=()
      while IFS= read -r line || [ -n "$line" ]; do
        line=${line%$'\r'}
        case "$line" in
          "!$terminal_dir/"?*)
            exception=${line#"!$terminal_dir/"}
            exception=${exception%/}
            [ -n "$exception" ] && exceptions+=("$exception")
            ;;
        esac
      done < ".gitignore"

      if [ "${#exceptions[@]}" -eq 0 ]; then
        if [ "$DRY_RUN" = true ]; then
          echo -e "${YELLOW}[DRY-RUN] Would remove terminal directory: $terminal_dir${NC}"
        else
          rm -rf "$terminal_dir"
          echo "Removed terminal directory: $terminal_dir"
        fi
        continue
      fi

      if [ "$DRY_RUN" = true ]; then
        echo -e "${YELLOW}[DRY-RUN] Would preserve exceptions in $terminal_dir: ${exceptions[*]}${NC}"
        echo -e "${YELLOW}[DRY-RUN] Would remove $terminal_dir except exceptions${NC}"
        continue
      fi

      # Move the exceptions aside with their relative layout intact.
      preserve_dir=".tmp_preserve.$$"
      preserve_ok=true
      mkdir -p "$preserve_dir" || preserve_ok=false
      for exception in "${exceptions[@]}"; do
        [ "$preserve_ok" = true ] || break
        [ -e "$terminal_dir/$exception" ] || continue
        if mkdir -p "$preserve_dir/$(dirname "$exception")" \
          && mv "$terminal_dir/$exception" "$preserve_dir/$exception"; then
          echo "Preserved $terminal_dir/$exception"
        else
          preserve_ok=false
        fi
      done

      if [ "$preserve_ok" != true ]; then
        # Never delete the directory when an exception could not be saved.
        echo "Error: Could not preserve exceptions in $terminal_dir; directory left in place (items already moved are in $preserve_dir)" >&2
        continue
      fi

      rm -rf "$terminal_dir"
      echo "Removed terminal directory: $terminal_dir"
      # The staging directory becomes the new terminal directory. Moving items
      # back into the path that was just removed would fail.
      if [ -n "$(ls -A "$preserve_dir" 2>/dev/null)" ]; then
        mv "$preserve_dir" "$terminal_dir"
      else
        rmdir "$preserve_dir"
      fi
    done
  else
    # Parallel deletion of terminal directories. The loop runs in the current
    # shell (not in a pipeline) so that `wait` sees the background jobs.
    for terminal_dir in "${terminal_dirs[@]}"; do
      [ -d "$terminal_dir" ] || continue
      if [ "$DRY_RUN" = true ]; then
        echo -e "${YELLOW}[DRY-RUN] Would remove terminal directory: $terminal_dir${NC}"
      else
        rm -rf "$terminal_dir" &
        echo "Removed terminal directory: $terminal_dir"
      fi
    done
    if [ "$DRY_RUN" = false ]; then
      wait
    fi
  fi

  # Terminal files directly inside this directory, matched in one find call.
  for pattern in "${terminal_patterns[@]}"; do
    if [ "${#name_tests[@]}" -gt 0 ]; then
      name_tests+=(-o)
    fi
    name_tests+=(-name "$pattern")
  done
  if [ "${#name_tests[@]}" -gt 0 ]; then
    if [ "$DRY_RUN" = true ]; then
      find . -maxdepth 1 -type f \( "${name_tests[@]}" \) -print0 \
        | while IFS= read -r -d '' path; do
          echo -e "${YELLOW}[DRY-RUN] Would remove terminal file: $path${NC}"
        done
    else
      find . -maxdepth 1 -type f \( "${name_tests[@]}" \) -print0 \
        | xargs -0 -P "$MAX_JOBS" rm -f --
    fi
  fi

  # .NET cleanup
  cleanup_dotnet "$dir"

  # Non-terminal cleanup: Python egg-info folders at any depth
  if [ "$DRY_RUN" = true ]; then
    find . -type d -name '*.egg-info' -prune -print0 \
      | while IFS= read -r -d '' path; do
        echo -e "${YELLOW}[DRY-RUN] Would remove Python egg-info folder: $path${NC}"
      done
  else
    find . -type d -name '*.egg-info' -prune -print0 \
      | xargs -0 -P "$MAX_JOBS" rm -rf --
  fi
}
#######################################
# Dispatch the cleanup of one directory:
#   1. a directory whose name is listed in TERMINAL_DIRS is removed outright;
#   2. a directory holding its own .git entry is handed to process_git_repo;
#   3. a directory inside a Git work tree with nothing tracked below it is
#      removed;
#   4. anything else gets cleanup_non_git, then this script is re-run on each
#      immediate subdirectory in parallel.
# Globals:   TERMINAL_DIRS, DRY_RUN, DRY_RUN_FLAG, MAX_JOBS, SCRIPT_PATH, RED,
#            YELLOW, NC (read)
# Arguments: $1 - directory to process
# Outputs:   progress messages on stdout
# Note:      changes the current working directory to $1.
#######################################
process_directory() {
  local dir="$1"
  if [ ! -d "$dir" ]; then
    echo -e "${RED}Error: Directory not found: $dir${NC}"
    return
  fi
  cd "$dir" || return

  # Check if this is a terminal directory. Names are compared exactly: a
  # word-regex grep also matched "cache" against ".cache", "vs" against ".vs"
  # and so on, and deleted those directories.
  local dir_name terminal_dir
  local -a terminal_dirs=()
  dir_name=$(basename "$dir")
  read -r -a terminal_dirs <<< "$TERMINAL_DIRS"
  for terminal_dir in "${terminal_dirs[@]}"; do
    if [ "$dir_name" = "$terminal_dir" ]; then
      if [ "$DRY_RUN" = true ]; then
        echo -e "${YELLOW}[DRY-RUN] Would remove terminal directory: $dir${NC}"
      else
        rm -rf "$dir" && echo "Removed terminal directory: $dir"
      fi
      return
    fi
  done

  # A repository root must be recognised before the untracked-directory check
  # below; otherwise a repository with nothing tracked would be deleted
  # together with its .git data. -e (not -d) also accepts the .git *file*
  # used by submodules and linked worktrees.
  if [ -e ".git" ]; then
    process_git_repo "$dir"
    return
  fi

  # Inside somebody else's work tree: drop the directory if nothing below it
  # is tracked. Only trust an empty listing when git itself succeeded.
  if [ "$(git rev-parse --is-inside-work-tree 2>/dev/null)" = true ]; then
    local tracked_files
    if tracked_files=$(git ls-files -- "$dir" 2>/dev/null) && [ -z "$tracked_files" ]; then
      if [ "$DRY_RUN" = true ]; then
        echo -e "${YELLOW}[DRY-RUN] Would remove untracked Git subdirectory: $dir${NC}"
      else
        rm -rf "$dir" && echo "Removed untracked Git subdirectory: $dir"
      fi
      return
    fi
  fi

  cleanup_non_git "$dir"
  # Parallel processing of subdirectories, NUL-delimited so that names with
  # spaces or quotes reach the child invocation unchanged.
  find . -maxdepth 1 -type d ! -path . -print0 \
    | xargs -0 -P "$MAX_JOBS" -I {} \
      bash "$SCRIPT_PATH" "{}" ${DRY_RUN_FLAG:+"$DRY_RUN_FLAG"} --silent
}
# Flag handed to recursive invocations so that they run in the same mode.
case "$DRY_RUN" in
  true) DRY_RUN_FLAG="--dry-run" ;;
  *) DRY_RUN_FLAG="" ;;
esac

# Startup banner (skipped when --silent was given).
if [ "$SILENT" = false ]; then
  printf '%s\n' \
    "Starting cleanup process..." \
    "Target directory: $TARGET_DIR" \
    "Dry run mode: $DRY_RUN" \
    "Max parallel jobs: $MAX_JOBS"
  if [ "$DOTNET_AVAILABLE" -eq 0 ]; then
    echo "Dotnet CLI available: Yes"
  else
    echo "Dotnet CLI available: No"
  fi
  echo "-----------------------------"
fi

process_directory "$TARGET_DIR"

# Closing banner (skipped when --silent was given).
if [ "$SILENT" = false ]; then
  printf '%s\n' \
    "-----------------------------" \
    "Cleanup process completed"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment