Skip to content

Instantly share code, notes, and snippets.

@thibaultmol
Last active August 3, 2023 16:04
Show Gist options
  • Save thibaultmol/7d45013033a50e90ba8e4ff99b741137 to your computer and use it in GitHub Desktop.
Save thibaultmol/7d45013033a50e90ba8e4ff99b741137 to your computer and use it in GitHub Desktop.
Bash script to remove all empty pages from every PDF in the current directory and its subdirectories
#!/bin/bash
# Ensure the required tools are installed before any PDF is touched.

#######################################
# Abort the script when a required command is not available.
# Arguments: $1 - command name looked up with `command -v`
#            $2 - optional package name, shown in parentheses as a hint
# Outputs:   an error message on stderr when the command is missing
# Returns:   0 when the command exists; otherwise exits the script with 1
#######################################
require_tool() {
  local tool=$1
  local package=${2:-}
  if command -v "$tool" >/dev/null 2>&1; then
    return 0
  fi
  if [[ -n $package ]]; then
    echo >&2 "$tool ($package) is required but it's not installed. Exiting."
  else
    echo >&2 "$tool is required but it's not installed. Exiting."
  fi
  exit 1
}

require_tool pdfimages poppler-utils
# pdfinfo is used later in this script to read each PDF's page count, so it
# has to be checked as well.
require_tool pdfinfo poppler-utils
require_tool convert ImageMagick
require_tool tesseract
require_tool pdftk
#######################################
# Count the ASCII letters (a-z, case-insensitive) in a string.
# Arguments: $1 - text to inspect (may be empty or span several lines)
# Outputs:   the number of letters, as a bare integer, on stdout
#######################################
count_letters() {
  local n
  # printf instead of echo: echo would treat text such as "-n" or "-e" as an
  # option and drop it. `grep -o` emits one match per line, so `wc -l` counts
  # individual letters (grep -c would only count matching lines).
  n=$(printf '%s\n' "$1" | grep -o -i '[a-z]' | wc -l)
  # Some wc implementations pad the number with spaces; normalise it.
  printf '%d\n' "$((n))"
}
# Main pass: for every PDF below the current directory, extract the embedded
# images of each page, OCR them, and drop the pages whose images contain fewer
# than $min_letters letters. Pages without any extracted image are always kept.

# Create a temporary directory for the extracted images and OCR output.
tmp_dir=$(mktemp -d) || { echo >&2 "Could not create a temporary directory. Exiting."; exit 1; }
# Remove the scratch directory whenever the script exits, including early exits.
trap 'rm -rf -- "$tmp_dir"' EXIT

# Minimum number of OCR-recognised letters for a page to count as non-empty.
min_letters=4

# Collect the PDFs recursively through subdirectories. The list is snapshotted
# up front (NUL-delimited, so any file name is safe) so that the tools run in
# the loop below neither share stdin with the file list nor race with find
# while files are being replaced.
pdfs=()
while IFS= read -r -d '' found; do
  pdfs+=("$found")
done < <(find . -type f -name '*.pdf' -print0)

for pdf in "${pdfs[@]}"; do
  # Anchor on the "Pages:" field so other pdfinfo lines that merely contain
  # the word "Pages" cannot pollute the count.
  total_pages=$(pdfinfo "$pdf" | awk '/^Pages:/ { print $2; exit }')
  if ! [[ $total_pages =~ ^[0-9]+$ ]]; then
    echo >&2 "Could not determine the page count of $pdf. Skipping."
    continue
  fi

  pages_to_keep=()
  for ((page = 0; page < total_pages; page++)); do
    page_no=$((page + 1)) # the PDF tools number pages from 1
    prefix="$tmp_dir/page_$page"
    pdfimages -f "$page_no" -l "$page_no" "$pdf" "$prefix"

    # Gather the images extracted for this page; an unmatched glob stays
    # literal, hence the -f test.
    imgs=()
    for img in "$prefix"-*.ppm "$prefix"-*.pbm; do
      if [[ -f "$img" ]]; then
        imgs+=("$img")
      fi
    done

    # No extracted image: nothing to OCR, so the page is kept as-is.
    if ((${#imgs[@]} == 0)); then
      pages_to_keep+=("$page_no")
      continue
    fi

    # OCR one image at a time and stop at the first one with enough letters.
    has_text=false
    for img in "${imgs[@]}"; do
      base=${img%.*}
      convert "$img" "$base.tiff"
      if [[ -f "$base.tiff" ]]; then
        tesseract "$base.tiff" "$base" # writes "$base.txt"
      fi
      rm -f -- "$img" "$base.tiff"
      if [[ -f "$base.txt" ]]; then
        text=$(<"$base.txt")
        rm -f -- "$base.txt"
        if [[ $(count_letters "$text") -ge $min_letters ]]; then
          has_text=true
          break
        fi
      fi
    done

    if [[ "$has_text" == true ]]; then
      pages_to_keep+=("$page_no")
    else
      echo "Page $page_no in $pdf is empty!"
    fi
  done

  if ((${#pages_to_keep[@]} == 0)); then
    echo "All pages in $pdf are empty. No changes made."
  elif ((${#pages_to_keep[@]} < total_pages)); then
    # Extract pages without modifications. The original is replaced only when
    # pdftk succeeded, so a failed run cannot clobber it with a partial file.
    if pdftk "$pdf" cat "${pages_to_keep[@]}" output "${pdf}.tmp"; then
      mv -- "${pdf}.tmp" "$pdf"
    else
      echo >&2 "pdftk failed on $pdf. Original left untouched."
      rm -f -- "${pdf}.tmp"
    fi
  fi
  # When every page is kept there is nothing to remove, so the file is left
  # untouched instead of being rewritten.

  # Clear leftovers before the next PDF; ${tmp_dir:?} aborts rather than
  # expanding to "/*" should the variable ever be empty.
  rm -rf -- "${tmp_dir:?}"/*
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment