Skip to content

Instantly share code, notes, and snippets.

@santrancisco
Created July 21, 2025 07:16
Show Gist options
  • Save santrancisco/b4dd1a4ebc66b4130ae9c45a80bb7e54 to your computer and use it in GitHub Desktop.
Save santrancisco/b4dd1a4ebc66b4130ae9c45a80bb7e54 to your computer and use it in GitHub Desktop.
Find duplicate files (printing rm commands for them) and save the list of file hashes in a SQLite database
#!/usr/bin/env bash
#
# Scan a directory tree, record an MD5 hash for every regular file in a
# SQLite database, and print an `rm` line for each file whose hash is
# already recorded under a different path.
#
# Usage: <script> [SEARCH_DIR]
#   SEARCH_DIR  directory to scan (defaults to the current directory)
#
# Nothing is deleted by this script; it only prints `#` comment lines and
# `rm` lines on stdout.
set -euo pipefail

# Root of the tree to scan.
SEARCH_DIR="${1:-.}"
# SQLite database holding (hash, path) rows; resolved relative to the
# current working directory.
DB_FILE="file_hashes.db"
readonly SEARCH_DIR DB_FILE

# Counter of files visited, used for the periodic progress line.
declare -i COUNT=0

# Report the failing line on stderr: stdout carries the generated `#`/`rm`
# lines and must not be polluted with diagnostics.
trap 'echo "Error on line $LINENO" >&2; exit 1' ERR
# Init DB if needed.
# Both statements use IF NOT EXISTS, so they are safe to run on every start.
# Running them unconditionally also repairs a database file that exists but
# has no schema yet (e.g. an empty file), which a "file exists" guard would
# silently skip.
sqlite3 "$DB_FILE" "
  CREATE TABLE IF NOT EXISTS files (hash TEXT, path TEXT UNIQUE);
  CREATE INDEX IF NOT EXISTS idx_hash ON files (hash);
"
# Process files
#
# For every regular file under SEARCH_DIR that is not yet recorded in
# DB_FILE: compute its MD5, print an `rm` line if another recorded path has
# the same hash, then record (hash, path).

#######################################
# Render a string as a SQLite text literal.
# Arguments: $1 - raw string (may contain single quotes)
# Outputs:   the string wrapped in single quotes, with every embedded
#            single quote doubled, written to stdout without a newline
#######################################
sql_quote() {
  local text=$1
  # Held in a variable so the pattern/replacement need no nested quoting.
  local sq="'"
  printf "'%s'" "${text//$sq/$sq$sq}"
}

# Absolute path of the database, so the scan can skip it when it lives
# inside the scanned tree (it changes on every insert). Left empty if it
# cannot be resolved, in which case nothing is skipped.
db_abs=$(realpath -- "$DB_FILE" 2>/dev/null) || db_abs=""

find "$SEARCH_DIR" -type f -print0 |
  while IFS= read -r -d '' file; do
    COUNT=$((COUNT + 1))
    if (( COUNT % 1000 == 0 )); then
      echo "# Processed $COUNT files..."
    fi

    # A file can disappear between find listing it and us reading it;
    # warn and move on instead of aborting the whole scan.
    if ! abs_path=$(realpath -- "$file"); then
      printf 'warning: cannot resolve %q, skipping\n' "$file" >&2
      continue
    fi

    # Never hash the database itself.
    if [[ -n "$db_abs" && "$abs_path" == "$db_abs" ]]; then
      continue
    fi

    # Escape in the shell: interpolating the raw path into a SQL string so
    # that SQLite can quote() it would already break on a single quote.
    quoted_file=$(sql_quote "$abs_path")

    # Skip if already processed. Best-effort: a failed lookup is treated as
    # "not recorded yet".
    exists=$(sqlite3 "$DB_FILE" "SELECT 1 FROM files WHERE path = $quoted_file LIMIT 1;" 2>/dev/null || true)
    if [[ "$exists" == "1" ]]; then
      continue
    fi

    # Compute hash from stdin: when given a file name containing a backslash
    # or newline, GNU md5sum prefixes the output line with a backslash,
    # which would corrupt the stored hash.
    digest=$(md5sum < "$file" | awk '{print $1}') || digest=""
    if [[ -z "$digest" ]]; then
      printf 'warning: cannot hash %q, skipping\n' "$file" >&2
      continue
    fi

    # Look for duplicate
    duplicate=$(sqlite3 "$DB_FILE" "SELECT path FROM files WHERE hash = '$digest' AND path != $quoted_file LIMIT 1;")
    if [[ -n "$duplicate" ]]; then
      # %q emits shell quoting (SQL quoting is not valid shell quoting) and
      # keeps a path containing a newline from injecting a line into the
      # generated output.
      printf '# Duplicate of: %q\n' "$duplicate"
      printf 'rm -- %q\n' "$abs_path"
    fi

    # Insert into DB
    sqlite3 "$DB_FILE" "INSERT OR IGNORE INTO files (hash, path) VALUES ('$digest', $quoted_file);"
  done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment