Last active
August 13, 2025 07:03
-
-
Save thushan/c1feb1bc57f7a479dad471e68a7b89a1 to your computer and use it in GitHub Desktop.
Index number of files in a directory
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# License: Apache-2.0
# Author: Thushan Fernando <[email protected]>
# http://github.com/thushan/migration-tools/
#
# Usage: ./migrate-disco.sh /path/to/directory [max_jobs] [max_depth] [strategy]
# Examples:
# ./migrate-disco.sh /mnt/efs
# ./migrate-disco.sh /mnt/efs 16 3 auto
# ./migrate-disco.sh /mnt/efs 16 1 deep
# ./migrate-disco.sh /mnt/efs 16 3 wide
# SAVE_RESULTS=1 ./migrate-disco.sh /mnt/efs
# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# Positional arguments (all optional):
#   $1 - directory to scan            (default: current working directory)
#   $2 - maximum parallel jobs        (default: number of online CPUs)
#   $3 - maximum directory depth      (default: 3)
#   $4 - strategy: auto | wide | deep (default: auto)
TARGET_DIR="${1:-$(pwd)}"
# nproc is GNU coreutils only; getconf covers systems without it, and 1 is the
# last resort so a missing tool never aborts the script under `set -e`.
MAX_JOBS="${2:-$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)}"
MAX_DEPTH="${3:-3}"
STRATEGY="${4:-auto}"
# Number of lines per chunk file produced by count_large_dir.
CHUNK_SIZE=1000

# Emit a terminal control sequence via tput, or nothing when tput is missing or
# the terminal is unusable (TERM unset, cron, CI). Without this guard a failing
# tput would terminate the script because of `set -e`.
term_code() {
  tput "$@" 2>/dev/null || true
}

# Colour palette used by the message helpers; every value may be empty.
CGrey=$(term_code setaf 239)
CRed=$(term_code setaf 1)
CGreen=$(term_code setaf 2)
CYellow=$(term_code setaf 3)
CBlue=$(term_code setaf 4)
CMagenta=$(term_code setaf 5)
CCyan=$(term_code setaf 6)
CWhite=$(term_code setaf 7)
ENDMARKER=$(term_code sgr0)
# Print a message in magenta on stdout.
# Arguments: $1 - text to print (optional; empty when omitted)
msg() {
  # printf instead of echo: a message such as "-n" must be printed, not
  # interpreted as an option when the colour codes are empty.
  printf '%s\n' "${CMagenta}${1-}${ENDMARKER}"
}
# Print a magenta message immediately followed by a green "Done!" on stdout.
# Arguments: $1 - text to print before "Done!" (optional)
msg_done() {
  printf '%s\n' "${CMagenta}${1-}${ENDMARKER}${CGreen}Done!${ENDMARKER}"
}
# Print a message in white on stdout.
# Arguments: $1 - text to print (optional; empty when omitted)
msg3() {
  # printf keeps option-looking text ("-n", "-e") literal when colours are empty.
  printf '%s\n' "${CWhite}${1-}${ENDMARKER}"
}
# Print a message in yellow on stdout.
# Arguments: $1 - text to print (optional; empty when omitted)
msg4() {
  # printf keeps option-looking text ("-n", "-e") literal when colours are empty.
  printf '%s\n' "${CYellow}${1-}${ENDMARKER}"
}
# Print a status line tagged with a green "OKAY" marker on stdout.
# Arguments: $1 - status text (optional; empty when omitted)
check_ok() {
  printf '%s\n' "[ ${CGreen}OKAY${ENDMARKER} ] ${1-}"
}
# Print a status line tagged with a yellow "WARN" marker on stdout.
# Arguments: $1 - warning text (optional; empty when omitted)
check_warn() {
  printf '%s\n' "[ ${CYellow}WARN${ENDMARKER} ] ${1-}"
}
# Print a status line tagged with a red "OOPS" marker on stdout.
# Arguments: $1 - error text (optional; empty when omitted)
check_error() {
  printf '%s\n' "[ ${CRed}OOPS${ENDMARKER} ] ${1-}"
}
# Print a status line tagged with a cyan "INFO" marker on stdout.
# Arguments: $1 - informational text (optional; empty when omitted)
check_info() {
  printf '%s\n' "[ ${CCyan}INFO${ENDMARKER} ] ${1-}"
}
# Print a red "FATAL:" message on stderr and terminate the script.
# Arguments: $1 - reason for the failure (optional; empty when omitted)
# Returns:   never returns; exits with status 1
fatal() {
  printf '%s\n' "${CRed}FATAL:${ENDMARKER} ${1-}${ENDMARKER}" >&2
  exit 1
}
# Validate the inputs before doing any work, then print the run banner.
if [[ ! -d "$TARGET_DIR" ]]; then
  fatal "Directory '$TARGET_DIR' not found"
fi
# A non-numeric job count or depth would otherwise only surface as a failing
# find/xargs whose stderr is discarded, killing the script without a message.
if [[ ! "$MAX_JOBS" =~ ^[0-9]+$ ]]; then
  fatal "max_jobs must be a non-negative integer, got '$MAX_JOBS'"
fi
if [[ ! "$MAX_DEPTH" =~ ^[0-9]+$ ]]; then
  fatal "max_depth must be a non-negative integer, got '$MAX_DEPTH'"
fi

echo '----------------------------------------'
echo "${CGreen}MIGRATION TOOLS - v2.626 (14/11/2023)${ENDMARKER}"
echo '----------------------------------------'
check_info "Scanning: ${CBlue}$TARGET_DIR${ENDMARKER}"
check_info "Strategy: ${CBlue}$STRATEGY${ENDMARKER}, Jobs: ${CBlue}$MAX_JOBS${ENDMARKER}, Depth: ${CBlue}$MAX_DEPTH${ENDMARKER}"
msg4 "Started at: $(date)"
echo "----------------------------------------"
# Count the regular files directly inside one directory (no recursion).
# Arguments: $1 - directory to inspect
# Outputs:   one line on stdout: the count right-aligned in 10 columns, a
#            space, then the directory path
# Notes:     relies on find's -printf; gfind is preferred when it is installed.
count_files_simple() {
  local dir="$1"
  local find_bin="find"
  local count

  if command -v gfind >/dev/null 2>&1; then
    find_bin="gfind"
  fi

  # One '.' per file, so the byte count equals the file count whatever the
  # file names contain. find exits non-zero on unreadable or vanished entries;
  # `|| true` keeps that best-effort result from aborting the whole scan when
  # the caller runs with `set -e -o pipefail`.
  count=$("$find_bin" "$dir" -maxdepth 1 -type f -printf '.' 2>/dev/null | wc -c) || true

  printf "%10d %s\n" "${count:-0}" "$dir"
}
# Count the lines in one chunk file.
# Arguments: $1 - path of the chunk file
# Outputs:   the line count on stdout, without padding
# Returns:   non-zero when the file cannot be read
count_chunk() {
  local chunk_file="$1"
  local lines

  lines=$(wc -l < "$chunk_file") || return
  # Some wc implementations pad the number with blanks; %d normalises it.
  printf '%d\n' "$lines"
}
# Count the regular files directly inside a directory by streaming one line per
# file into chunk files and summing the chunk line counts.
# Globals:   CHUNK_SIZE (read) - lines per chunk file
#            MAX_JOBS   (read) - job limit handed to parallel
# Arguments: $1 - directory to inspect
#            $2 - existing scratch directory that receives the chunk files
# Outputs:   the total file count on stdout
# Notes:     uses count_chunk for each chunk; when `parallel` is used, the
#            caller must have exported count_chunk.
count_large_dir() {
  local dir="$1"
  local temp_base="$2"
  local find_bin="find"

  if command -v gfind >/dev/null 2>&1; then
    find_bin="gfind"
  fi

  # Emit a bare newline per file rather than the file name, so a name that
  # itself contains newlines is still counted once. A partial find failure is
  # tolerated; a split failure is not. `-a 6` widens the chunk suffix: the
  # default two letters run out after 676 chunks (676,000 files at the default
  # CHUNK_SIZE), after which split aborts.
  { "$find_bin" "$dir" -maxdepth 1 -type f -printf '\n' 2>/dev/null || true; } \
    | split -a 6 -l "$CHUNK_SIZE" - "$temp_base/"

  if command -v parallel >/dev/null 2>&1; then
    # sum+0 prints 0 rather than an empty line when there are no chunks.
    find "$temp_base" -type f \
      | parallel -j "$MAX_JOBS" count_chunk {} \
      | awk '{sum+=$1} END {print sum+0}'
  else
    local total=0
    local chunk lines
    for chunk in "$temp_base"/*; do
      # With no chunks the glob stays literal; skip anything that is not a file.
      [[ -f "$chunk" ]] || continue
      lines=$(count_chunk "$chunk")
      total=$((total + lines))
    done
    printf '%s\n' "$total"
  fi
}
# Choose a scan strategy from the number of directories under the target.
# Globals:   MAX_DEPTH (read) - how deep to look for directories
# Arguments: $1 - directory to inspect
# Outputs:   "deep" when at most 5 directories (the target itself included)
#            are found within MAX_DEPTH, otherwise "wide"
determine_strategy() {
  local target_dir="$1"
  local -r threshold=5
  local dir_count

  # Only "more than threshold or not" matters, so stop reading after
  # threshold+1 entries instead of walking the whole tree. find may then be
  # ended by SIGPIPE or fail on unreadable entries; `|| true` keeps either case
  # from aborting the script under `set -e -o pipefail`.
  dir_count=$(
    find "$target_dir" -maxdepth "$MAX_DEPTH" -type d \
      | head -n "$((threshold + 1))" \
      | wc -l
  ) || true

  if [[ "${dir_count:-0}" -le "$threshold" ]]; then
    echo "deep"
  else
    echo "wide"
  fi
}
# WIDE strategy: list every directory down to MAX_DEPTH and count the files of
# each one, several directories at a time.
# Globals:   TARGET_DIR, MAX_DEPTH, MAX_JOBS (read)
#            TEMP_DIR, RESULTS_FILE, DIRS_FILE (written; RESULTS_FILE receives
#            one count_files_simple line per directory)
# Outputs:   progress messages on stdout
# Notes:     installs an EXIT trap that removes TEMP_DIR. The directory list is
#            newline-delimited, so directory names containing newlines are not
#            supported.
run_wide_strategy() {
  # Child shells started by parallel / `bash -c` need the function.
  export -f count_files_simple

  TEMP_DIR=$(mktemp -d)
  trap 'rm -rf -- "${TEMP_DIR:?}"' EXIT
  RESULTS_FILE="$TEMP_DIR/results"
  DIRS_FILE="$TEMP_DIR/dirs"

  local total_dirs

  msg "Discovering directories (depth $MAX_DEPTH)..."
  # find exits non-zero when it meets unreadable directories; under `set -e`
  # that used to end the script without any message. Keep the partial list.
  if ! find "$TARGET_DIR" -maxdepth "$MAX_DEPTH" -type d 2>/dev/null > "$DIRS_FILE"; then
    check_warn "Some directories could not be read; results may be incomplete"
  fi
  total_dirs=$(wc -l < "$DIRS_FILE")
  total_dirs=$((total_dirs)) # drop any padding wc adds
  check_ok "Found ${CBlue}$total_dirs${ENDMARKER} directories to process"
  echo

  if command -v parallel >/dev/null 2>&1; then
    parallel -j "$MAX_JOBS" --progress count_files_simple {} < "$DIRS_FILE" > "$RESULTS_FILE"
  elif command -v xargs >/dev/null 2>&1; then
    # NUL-delimit the list: plain xargs treats quotes and backslashes in its
    # input specially, which breaks on paths such as "it's here".
    tr '\n' '\0' < "$DIRS_FILE" \
      | xargs -0 -P "$MAX_JOBS" -I {} bash -c 'count_files_simple "$@"' _ {} > "$RESULTS_FILE"
  else
    check_warn "No parallel processing available, running sequentially"
    local dir
    while IFS= read -r dir; do
      count_files_simple "$dir"
    done < "$DIRS_FILE" > "$RESULTS_FILE"
  fi
}
# DEEP strategy: count the files of each immediate subdirectory of TARGET_DIR,
# one directory at a time, using the chunked counter.
# Globals:   TARGET_DIR (read)
#            TEMP_DIR, RESULTS_FILE (written; RESULTS_FILE receives one
#            "count path" line per subdirectory)
# Outputs:   progress messages on stdout
# Notes:     installs an EXIT trap that removes TEMP_DIR. Files that sit
#            directly in TARGET_DIR are not counted, only those in its
#            immediate subdirectories.
run_deep_strategy() {
  # count_large_dir may hand count_chunk to parallel, which needs it exported.
  export -f count_chunk

  TEMP_DIR=$(mktemp -d)
  trap 'rm -rf -- "${TEMP_DIR:?}"' EXIT
  RESULTS_FILE="$TEMP_DIR/results"
  # Create the file up front so later readers find it even with no subdirectories.
  : > "$RESULTS_FILE"

  msg "Discovering top-level directories..."
  # -mindepth 1 excludes TARGET_DIR itself without a regex built from the path,
  # and the NUL-delimited read keeps paths with spaces or glob characters
  # intact. An empty result is no longer an error (grep -v used to return 1
  # and abort the script under pipefail).
  local -a dirs=()
  local dir
  while IFS= read -r -d '' dir; do
    dirs+=("$dir")
  done < <(find "$TARGET_DIR" -mindepth 1 -maxdepth 1 -type d -print0)

  local total_dirs=${#dirs[@]}
  check_ok "Found ${CBlue}$total_dirs${ENDMARKER} directories to process"
  echo

  local dir_name temp_chunk_dir file_count
  # The ${arr[@]+...} form avoids an "unbound variable" error for an empty
  # array on bash versions older than 4.4 when `set -u` is active.
  for dir in ${dirs[@]+"${dirs[@]}"}; do
    dir_name=$(basename "$dir")
    temp_chunk_dir="$TEMP_DIR/chunks_${dir_name}"
    mkdir -p "$temp_chunk_dir"
    msg "Processing: ${CBlue}$dir${ENDMARKER}"
    file_count=$(count_large_dir "$dir" "$temp_chunk_dir")
    file_count=${file_count:-0}
    printf "%10d %s\n" "$file_count" "$dir" >> "$RESULTS_FILE"
    check_ok "Counted ${CCyan}$file_count${ENDMARKER} files in ${CBlue}$dir_name${ENDMARKER}"
    rm -rf "$temp_chunk_dir"
  done
}
# Resolve "auto" to a concrete strategy based on the directory layout.
if [[ "$STRATEGY" == "auto" ]]; then
  STRATEGY=$(determine_strategy "$TARGET_DIR")
  check_info "Auto-selected strategy: ${CBlue}$STRATEGY${ENDMARKER}"
fi

# Run the selected strategy; each one leaves its output in RESULTS_FILE.
case "$STRATEGY" in
  wide)
    check_info "Using ${CBlue}WIDE${ENDMARKER} strategy (parallel directories)"
    run_wide_strategy
    ;;
  deep)
    check_info "Using ${CBlue}DEEP${ENDMARKER} strategy (chunked file processing)"
    run_deep_strategy
    ;;
  *)
    fatal "Invalid strategy '$STRATEGY'. Use: auto, wide, or deep"
    ;;
esac
# Report: the 20 largest directories, summary statistics, optional saved copy.
echo
msg4 "Results (sorted by file count):"
echo "----------------------------------------"
printf "%10s %s\n" "FILES" "DIRECTORY"
echo "----------------------------------------"
# awk reads sort's whole output before finishing. `head` would exit after 20
# lines, and with many directories sort would then die of SIGPIPE, which
# `pipefail` + `set -e` turn into a silent abort before the summary.
sort -nr "$RESULTS_FILE" | awk 'NR <= 20'
echo "----------------------------------------"

# One pass over the results: line count, sum, integer average and maximum.
# The average is guarded against an empty results file (division by zero), and
# %.0f keeps large totals from being printed in exponent notation.
read -r total_dirs total_files avg_files max_files < <(
  awk '
    { sum += $1; if (NR == 1 || $1 + 0 > max) max = $1 + 0 }
    END { printf "%d %.0f %.0f %.0f\n", NR, sum, (NR ? int(sum / NR) : 0), max + 0 }
  ' "$RESULTS_FILE"
)

msg4 "Summary:"
msg3 " Strategy used: ${CCyan}$STRATEGY${ENDMARKER}"
msg3 " Total directories: ${CCyan}$total_dirs${ENDMARKER}"
msg3 " Total files: ${CCyan}$total_files${ENDMARKER}"
msg3 " Average files per dir: ${CCyan}$avg_files${ENDMARKER}"
msg3 " Maximum files in one dir: ${CCyan}$max_files${ENDMARKER}"
msg4 "Completed at: $(date)"

# SAVE_RESULTS=1 keeps the full sorted list in a timestamped file in the
# current working directory.
if [[ "${SAVE_RESULTS:-}" == "1" ]]; then
  output_file="file_count_$(date +%Y%m%d_%H%M%S).txt"
  sort -nr "$RESULTS_FILE" > "$output_file"
  check_ok "Full results saved to: ${CBlue}$output_file${ENDMARKER}"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment