Last active
August 3, 2023 16:04
-
-
Save thibaultmol/7d45013033a50e90ba8e4ff99b741137 to your computer and use it in GitHub Desktop.
Bash script to remove all empty pages from every PDF in the current directory and its subdirectories
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Ensure the required tools are installed

#######################################
# Abort the script unless a command can be found on PATH.
# Arguments: $1 - command name to look up
#            $2 - label shown in the error message
# Outputs:   an error message on stderr when the command is missing
# Returns:   does not return when the command is missing (exits with 1)
#######################################
require_tool() {
  command -v "$1" >/dev/null 2>&1 || { echo >&2 "$2 is required but it's not installed. Exiting."; exit 1; }
}

require_tool pdfimages "pdfimages (poppler-utils)"
# pdfinfo is used below to read the page count, so it must be checked as well.
require_tool pdfinfo "pdfinfo (poppler-utils)"
require_tool convert "convert (ImageMagick)"
require_tool tesseract "tesseract"
require_tool pdftk "pdftk"
#######################################
# Count the ASCII letters (A-Z, a-z) contained in a string.
# Arguments: $1 - text to inspect (may be empty or span several lines)
# Outputs:   the number of letters as a bare integer on stdout
#######################################
count_letters() {
  local count
  # printf instead of echo: echo would interpret a value such as "-n" or "-e"
  # as an option and print nothing. LC_ALL=C makes tr operate on bytes, so
  # every non-ASCII character is simply discarded.
  count=$(printf '%s' "$1" | LC_ALL=C tr -cd 'A-Za-z' | wc -c)
  # Arithmetic expansion strips any padding wc may add around the number.
  printf '%s\n' "$((count))"
}
# Create a temporary directory for extracted images and OCR output.
# Abort if it cannot be created: continuing with an empty $tmp_dir would turn
# later cleanup commands into deletions relative to the filesystem root.
tmp_dir=$(mktemp -d) || { echo >&2 "Could not create a temporary directory. Exiting."; exit 1; }
# Remove the scratch directory on every exit path, including interruptions.
trap 'rm -rf -- "${tmp_dir:?}"' EXIT

#######################################
# Decide whether a single page of a PDF should be kept.
# A page without embedded images is always kept. A page with images is kept
# only if OCR of at least one image yields 4 or more letters.
# Globals:   tmp_dir (read) - scratch directory for intermediate files
# Arguments: $1 - path of the PDF
#            $2 - 1-based page number
# Returns:   0 when the page should be kept, 1 when it is considered empty
#######################################
page_has_content() {
  local pdf_path=$1 page_no=$2
  local prefix="$tmp_dir/page_$page_no"
  local img base
  local found_image=false
  local verdict=1

  pdfimages -f "$page_no" -l "$page_no" "$pdf_path" "$prefix"

  for img in "$prefix"-*.ppm "$prefix"-*.pbm; do
    # An unmatched glob stays literal, so skip anything that is not a file.
    [[ -f "$img" ]] || continue
    found_image=true
    base=${img%.*}
    # tesseract writes its result to "<base>.txt".
    convert "$img" "$base.tiff" && tesseract "$base.tiff" "$base"
    if [[ -f "$base.txt" ]] && (( $(count_letters "$(<"$base.txt")") >= 4 )); then
      verdict=0
      break   # one readable image is enough to keep the page
    fi
  done

  # Pages that contain no images at all are kept untouched.
  if [[ "$found_image" == false ]]; then
    verdict=0
  fi

  # Drop this page's intermediate files (images, TIFFs, OCR text).
  rm -f -- "$prefix"-*
  return "$verdict"
}

# Process PDFs recursively through subdirectories.
# The file list is NUL-delimited and read from descriptor 3, so unusual file
# names are handled and the tools inside the loop keep the script's own stdin.
while IFS= read -r -d '' pdf <&3; do
  # Anchor on the "Pages:" field so other pdfinfo lines cannot match.
  total_pages=$(pdfinfo "$pdf" | awk '/^Pages:/ {print $2; exit}')
  if [[ ! "$total_pages" =~ ^[0-9]+$ ]]; then
    echo >&2 "Could not determine the page count of $pdf. Skipping."
    continue
  fi

  pages_to_keep=()
  for ((page = 1; page <= total_pages; page++)); do
    if page_has_content "$pdf" "$page"; then
      pages_to_keep+=("$page")
    else
      echo "Page $page in $pdf is empty!"
    fi
  done

  if (( ${#pages_to_keep[@]} == 0 )); then
    echo "All pages in $pdf are empty. No changes made."
  elif (( ${#pages_to_keep[@]} < total_pages )); then
    # Extract pages without modifications. The original is replaced only
    # after pdftk succeeded, so a failed run can never clobber the PDF.
    if pdftk "$pdf" cat "${pages_to_keep[@]}" output "${pdf}.tmp"; then
      mv -- "${pdf}.tmp" "$pdf"
    else
      echo >&2 "pdftk failed for $pdf. The original file was left unchanged."
      rm -f -- "${pdf}.tmp"
    fi
  fi
  # When every page is kept there is nothing to remove, so the file is not rewritten.
done 3< <(find . -type f -name '*.pdf' -print0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment