Skip to content

Instantly share code, notes, and snippets.

@thushan
Last active August 13, 2025 07:03
Show Gist options
  • Save thushan/c1feb1bc57f7a479dad471e68a7b89a1 to your computer and use it in GitHub Desktop.
Save thushan/c1feb1bc57f7a479dad471e68a7b89a1 to your computer and use it in GitHub Desktop.
Index number of files in a directory
#!/usr/bin/env bash
# License: Apache-2.0
# Author: Thushan Fernando <[email protected]>
# http://github.com/thushan/migration-tools/
#
# Usage: ./migrate-disco.sh /path/to/directory [max_jobs] [max_depth] [strategy]
# Examples:
# ./migrate-disco.sh /mnt/efs
# ./migrate-disco.sh /mnt/efs 16 3 auto
# ./migrate-disco.sh /mnt/efs 16 1 deep
# ./migrate-disco.sh /mnt/efs 16 3 wide
# SAVE_RESULTS=1 ./migrate-disco.sh /mnt/efs
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration (positional arguments with defaults)
#   $1 - directory to scan            (default: current directory)
#   $2 - maximum parallel jobs        (default: number of online CPUs)
#   $3 - maximum directory depth      (default: 3)
#   $4 - strategy: auto | wide | deep (default: auto)
# ---------------------------------------------------------------------------
TARGET_DIR="${1:-$(pwd)}"
# nproc is GNU coreutils only; fall back to getconf / sysctl (macOS, BSD) and
# finally to a single job so a missing tool never aborts the script.
MAX_JOBS="${2:-$(nproc 2>/dev/null \
  || getconf _NPROCESSORS_ONLN 2>/dev/null \
  || sysctl -n hw.ncpu 2>/dev/null \
  || echo 1)}"
MAX_DEPTH="${3:-3}"
STRATEGY="${4:-auto}"
CHUNK_SIZE=1000

# Look up a terminal capability without ever failing: tput exits non-zero
# when TERM is unset/unknown (cron, CI, pipes) or is not installed, which
# would otherwise kill the script under `set -e`. On failure the colour is
# simply empty and output is uncoloured.
_color() {
  tput "$@" 2>/dev/null || true
}

CGrey=$(_color setaf 239)
CRed=$(_color setaf 1)
CGreen=$(_color setaf 2)
CYellow=$(_color setaf 3)
CBlue=$(_color setaf 4)
CMagenta=$(_color setaf 5)
CCyan=$(_color setaf 6)
CWhite=$(_color setaf 7)
ENDMARKER=$(_color sgr0)
# Print a progress message in magenta.
# Globals:   CMagenta, ENDMARKER (read)
# Arguments: $1 - text to print
# Outputs:   the coloured text and a newline on stdout
msg() {
  local text="$1"
  echo "${CMagenta}${text}${ENDMARKER}"
}
# Print a progress message in magenta followed by a green "Done!" marker.
# Globals:   CMagenta, CGreen, ENDMARKER (read)
# Arguments: $1 - text to print before the marker
# Outputs:   one line on stdout
msg_done() {
  local line="${CMagenta}$1${ENDMARKER}"
  line+="${CGreen}Done!${ENDMARKER}"
  echo "$line"
}
# Print a message in white (used for the summary detail lines).
# Globals:   CWhite, ENDMARKER (read)
# Arguments: $1 - text to print
# Outputs:   the coloured text and a newline on stdout
msg3() {
  local body="$1"
  echo "${CWhite}${body}${ENDMARKER}"
}
# Print a message in yellow (used for headings and timestamps).
# Globals:   CYellow, ENDMARKER (read)
# Arguments: $1 - text to print
# Outputs:   the coloured text and a newline on stdout
msg4() {
  local heading="$1"
  echo "${CYellow}${heading}${ENDMARKER}"
}
# Print a status line tagged with a green "OKAY" badge.
# Globals:   CGreen, ENDMARKER (read)
# Arguments: $1 - status text
# Outputs:   "[ OKAY ] <text>" on stdout
check_ok() {
  printf '[ %s%s%s ] %s\n' "$CGreen" 'OKAY' "$ENDMARKER" "$1"
}
# Print a status line tagged with a yellow "WARN" badge.
# Globals:   CYellow, ENDMARKER (read)
# Arguments: $1 - warning text
# Outputs:   "[ WARN ] <text>" on stdout
check_warn() {
  printf '[ %s%s%s ] %s\n' "$CYellow" 'WARN' "$ENDMARKER" "$1"
}
# Print a status line tagged with a red "OOPS" badge.
# Globals:   CRed, ENDMARKER (read)
# Arguments: $1 - error text
# Outputs:   "[ OOPS ] <text>" on stdout (does not exit)
check_error() {
  printf '[ %s%s%s ] %s\n' "$CRed" 'OOPS' "$ENDMARKER" "$1"
}
# Print a status line tagged with a cyan "INFO" badge.
# Globals:   CCyan, ENDMARKER (read)
# Arguments: $1 - informational text
# Outputs:   "[ INFO ] <text>" on stdout
check_info() {
  printf '[ %s%s%s ] %s\n' "$CCyan" 'INFO' "$ENDMARKER" "$1"
}
# Report an unrecoverable error and terminate the script.
# Globals:   CRed, ENDMARKER (read)
# Arguments: $1 - description of the failure
# Outputs:   "FATAL: <text>" on stderr
# Returns:   never; exits the shell with status 1
fatal() {
  local reason="$1"
  printf '%s\n' "${CRed}FATAL:${ENDMARKER} ${reason}${ENDMARKER}" 1>&2
  exit 1
}
# Refuse to start when the scan root is not an existing directory.
[[ -d "$TARGET_DIR" ]] || fatal "Directory '$TARGET_DIR' not found"

# Banner: tool version followed by the effective settings for this run.
printf '%s\n' '----------------------------------------'
printf '%s\n' "${CGreen}MIGRATION TOOLS - v2.626 (14/11/2023)${ENDMARKER}"
printf '%s\n' '----------------------------------------'
check_info "Scanning: ${CBlue}$TARGET_DIR${ENDMARKER}"
check_info "Strategy: ${CBlue}$STRATEGY${ENDMARKER}, Jobs: ${CBlue}$MAX_JOBS${ENDMARKER}, Depth: ${CBlue}$MAX_DEPTH${ENDMARKER}"
msg4 "Started at: $(date)"
printf '%s\n' "----------------------------------------"
# Count the regular files directly inside one directory (no recursion).
# Arguments: $1 - directory to inspect
# Outputs:   one line "<count right-aligned to 10 columns> <directory>" on stdout
# Returns:   0; a directory that cannot be read is reported with the files
#            that could be seen (possibly 0) instead of aborting the scan
count_files_simple() {
  local dir="$1"
  local count

  # -print0 is understood by both GNU and BSD find (unlike -printf), so no
  # gfind detection is needed. Counting the NUL terminators gives exactly one
  # per file, whatever characters the file names contain.
  # `|| true`: with `set -e` + `pipefail` a find error (permission denied,
  # directory removed mid-scan) would otherwise terminate the caller; errors
  # are deliberately best-effort here, matching the 2>/dev/null.
  count=$(find "$dir" -maxdepth 1 -type f -print0 2>/dev/null \
    | tr -cd '\0' \
    | wc -c) || true

  # $((...)) strips the padding some wc implementations add and maps an
  # empty value to 0.
  printf "%10d %s\n" "$((count))" "$dir"
}
# Report how many lines a chunk file contains.
# Arguments: $1 - path of the chunk file
# Outputs:   the line count, as printed by `wc -l` reading from stdin
# Returns:   status of the redirection / wc
count_chunk() {
  local -r listing="${1}"
  wc -l 0< "$listing"
}
# Count the regular files directly inside a (potentially huge) directory by
# writing one line per file into fixed-size chunk files and summing the
# chunk line counts, in parallel when GNU parallel is available.
# Globals:   CHUNK_SIZE (read, lines per chunk, default 1000)
#            MAX_JOBS   (read, parallel job limit, default 1)
# Arguments: $1 - directory to count
#            $2 - existing scratch directory that receives the chunk files
# Outputs:   the total number of files on stdout (always a number)
# Requires:  count_chunk (exported by the caller when parallel is used)
count_large_dir() {
  local dir="$1"
  local temp_base="$2"
  local chunk_size="${CHUNK_SIZE:-1000}"
  local jobs="${MAX_JOBS:-1}"
  local chunk
  local total=0

  # Emit exactly one (empty) line per file:
  #  * -print0 works with GNU and BSD find, unlike -printf;
  #  * dropping the names and turning each NUL into a newline means a file
  #    name containing a newline can no longer be counted twice;
  #  * -a 6 gives split enough suffixes for very large directories (the
  #    2-character default allows only 676 chunks on non-GNU split).
  # Listing errors are best-effort, as with the 2>/dev/null.
  find "$dir" -maxdepth 1 -type f -print0 2>/dev/null \
    | tr -cd '\0' \
    | tr '\0' '\n' \
    | split -a 6 -l "$chunk_size" - "$temp_base/" || true

  if command -v parallel >/dev/null 2>&1; then
    # `sum + 0` forces a numeric result so an empty directory prints 0
    # rather than an empty string.
    find "$temp_base" -type f \
      | parallel -j "$jobs" count_chunk {} \
      | awk '{sum += $1} END {print sum + 0}'
  else
    for chunk in "$temp_base"/*; do
      # An unmatched glob stays literal; the -f test skips it.
      if [[ -f "$chunk" ]]; then
        total=$((total + $(count_chunk "$chunk")))
      fi
    done
    echo "$total"
  fi
}
# Pick a scanning strategy from the shape of the directory tree.
# Globals:   MAX_DEPTH (read, default 3)
# Arguments: $1 - root directory of the scan
# Outputs:   "deep" when at most 5 directories (the root included) exist
#            within MAX_DEPTH levels, otherwise "wide"
determine_strategy() {
  local target_dir="$1"
  local depth="${MAX_DEPTH:-3}"
  local dir_count

  # Declared separately from the assignment so `local` does not mask the
  # pipeline status. NUL-terminated output is counted so a directory name
  # containing a newline is still counted once. Unreadable sub-trees are
  # skipped quietly, as in the directory discovery of the wide scan.
  dir_count=$(find "$target_dir" -maxdepth "$depth" -type d -print0 2>/dev/null \
    | tr -cd '\0' \
    | wc -c) || true

  if (( dir_count <= 5 )); then
    echo "deep"
  else
    echo "wide"
  fi
}
# WIDE strategy: list every directory up to MAX_DEPTH below TARGET_DIR and
# count the files of each directory, many directories at a time.
# Globals:   TARGET_DIR, MAX_DEPTH, MAX_JOBS (read)
#            TEMP_DIR, RESULTS_FILE, DIRS_FILE, c_status, total_dirs (written)
# Outputs:   progress messages on stdout; one "<count> <dir>" line per
#            directory in RESULTS_FILE
# Side effects: installs an EXIT trap that removes TEMP_DIR
run_wide_strategy() {
  # Child shells started by parallel / xargs need the function.
  export -f count_files_simple

  TEMP_DIR=$(mktemp -d)
  trap 'rm -rf "$TEMP_DIR"' EXIT
  RESULTS_FILE="$TEMP_DIR/results"
  DIRS_FILE="$TEMP_DIR/dirs"

  c_status="Discovering directories (depth $MAX_DEPTH)..."
  msg "$c_status"

  # find exits non-zero when any sub-directory is unreadable; with `set -e`
  # that would end the whole run without a message (stderr is discarded).
  # Keep whatever could be listed instead.
  find "$TARGET_DIR" -maxdepth "$MAX_DEPTH" -type d 2>/dev/null > "$DIRS_FILE" || true
  total_dirs=$(wc -l < "$DIRS_FILE")
  check_ok "Found ${CBlue}$total_dirs${ENDMARKER} directories to process"
  echo

  if command -v parallel >/dev/null 2>&1; then
    parallel -j "$MAX_JOBS" --progress count_files_simple {} < "$DIRS_FILE" > "$RESULTS_FILE"
  elif command -v xargs >/dev/null 2>&1; then
    # Plain `xargs -I` still interprets quotes and backslashes in its input,
    # so a directory such as "it's" made it fail. Feeding NUL-terminated
    # items with -0 passes every line through literally.
    tr '\n' '\0' < "$DIRS_FILE" \
      | xargs -0 -P "$MAX_JOBS" -I {} bash -c 'count_files_simple "$@"' _ {} > "$RESULTS_FILE"
  else
    check_warn "No parallel processing available, running sequentially"
    local dir
    while IFS= read -r dir; do
      count_files_simple "$dir"
    done < "$DIRS_FILE" > "$RESULTS_FILE"
  fi
}
# DEEP strategy: process the immediate sub-directories of TARGET_DIR one at
# a time, counting each with the chunked counter (count_large_dir).
# Globals:   TARGET_DIR (read)
#            TEMP_DIR, RESULTS_FILE, total_dirs (written)
# Outputs:   progress messages on stdout; one "<count> <dir>" line per
#            sub-directory in RESULTS_FILE (empty file when there are none)
# Side effects: installs an EXIT trap that removes TEMP_DIR
run_deep_strategy() {
  local -a dirs=()
  local dir dir_name temp_chunk_dir file_count

  # count_large_dir hands count_chunk to parallel, which runs it in a child shell.
  export -f count_chunk

  TEMP_DIR=$(mktemp -d)
  trap 'rm -rf "$TEMP_DIR"' EXIT
  RESULTS_FILE="$TEMP_DIR/results"
  # Create the file up front so later readers find it even when no
  # sub-directory exists.
  : > "$RESULTS_FILE"

  msg "Discovering top-level directories..."
  # -mindepth 1 excludes TARGET_DIR itself without a grep filter, which
  # treated the path as a regex and, when nothing was left, returned 1 and
  # ended the script under `set -e` + `pipefail`. Reading NUL-terminated
  # names keeps paths with spaces or glob characters intact.
  while IFS= read -r -d '' dir; do
    dirs+=("$dir")
  done < <(find "$TARGET_DIR" -mindepth 1 -maxdepth 1 -type d -print0 2>/dev/null)

  total_dirs=${#dirs[@]}
  check_ok "Found ${CBlue}$total_dirs${ENDMARKER} directories to process"
  echo

  # The ${arr[@]+...} form keeps `set -u` happy with an empty array on
  # older bash releases.
  for dir in ${dirs[@]+"${dirs[@]}"}; do
    dir_name=$(basename -- "$dir")
    temp_chunk_dir="$TEMP_DIR/chunks_${dir_name}"
    mkdir -p "$temp_chunk_dir"

    msg "Processing: ${CBlue}$dir${ENDMARKER}"
    file_count=$(count_large_dir "$dir" "$temp_chunk_dir")
    file_count=$(( ${file_count:-0} ))   # normalise blank/padded output
    printf "%10d %s\n" "$file_count" "$dir" >> "$RESULTS_FILE"
    check_ok "Counted ${CCyan}$file_count${ENDMARKER} files in ${CBlue}$dir_name${ENDMARKER}"

    rm -rf "$temp_chunk_dir"
  done
}
# Resolve "auto" into a concrete strategy based on the directory layout.
case "$STRATEGY" in
  auto)
    STRATEGY=$(determine_strategy "$TARGET_DIR")
    check_info "Auto-selected strategy: ${CBlue}$STRATEGY${ENDMARKER}"
    ;;
esac

# Run the selected strategy; anything other than wide/deep is rejected.
if [[ "$STRATEGY" == "wide" ]]; then
  check_info "Using ${CBlue}WIDE${ENDMARKER} strategy (parallel directories)"
  run_wide_strategy
elif [[ "$STRATEGY" == "deep" ]]; then
  check_info "Using ${CBlue}DEEP${ENDMARKER} strategy (chunked file processing)"
  run_deep_strategy
else
  fatal "Invalid strategy '$STRATEGY'. Use: auto, wide, or deep"
fi
# ---------------------------------------------------------------------------
# Report: top 20 directories by file count, then summary statistics.
# Reads RESULTS_FILE ("<count> <directory>" per line) written by the strategy.
# ---------------------------------------------------------------------------
echo
msg4 "Results (sorted by file count):"
echo "----------------------------------------"
printf "%10s %s\n" "FILES" "DIRECTORY"
echo "----------------------------------------"
# awk consumes all of sort's output, unlike `head -20`, which exits early:
# on a large result set sort is then killed by SIGPIPE and, with
# `pipefail` + `set -e`, the script stops right after printing the table.
sort -nr "$RESULTS_FILE" | awk 'NR <= 20'
echo "----------------------------------------"

# One pass over the results: line count, sum, integer average and maximum.
# The average is guarded so an empty results file yields 0 instead of an
# awk "division by zero" error that aborts the script.
result_summary=$(awk '
  { sum += $1; if ($1 + 0 > max) max = $1 + 0 }
  END { printf "%d %d %d %d\n", NR, sum, (NR > 0 ? int(sum / NR) : 0), max }
' "$RESULTS_FILE")
read -r result_dirs total_files avg_files max_files <<< "$result_summary"

msg4 "Summary:"
msg3 " Strategy used: ${CCyan}$STRATEGY${ENDMARKER}"
msg3 " Total directories: ${CCyan}$result_dirs${ENDMARKER}"
msg3 " Total files: ${CCyan}$total_files${ENDMARKER}"
msg3 " Average files per dir: ${CCyan}$avg_files${ENDMARKER}"
msg3 " Maximum files in one dir: ${CCyan}$max_files${ENDMARKER}"
msg4 "Completed at: $(date)"

# Optional: keep the full sorted listing when SAVE_RESULTS=1 is set.
if [[ "${SAVE_RESULTS:-}" == "1" ]]; then
  output_file="file_count_$(date +%Y%m%d_%H%M%S).txt"
  sort -nr "$RESULTS_FILE" > "$output_file"
  check_ok "Full results saved to: ${CBlue}$output_file${ENDMARKER}"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment