Last active
February 24, 2025 00:51
-
-
Save matthieuheitz/7287e214b1aeda7948f6c27fbfb5288b to your computer and use it in GitHub Desktop.
djvu2pdf, a conversion script using ocrodjvu and pdfbeads
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# djvu2pdf: convert a DjVu document into a PDF, page by page, using
# djvu2hocr (from ocrodjvu), ddjvu/djvused and pdfbeads.
# Method found here https://askubuntu.com/a/122604/423332
#
# Usage: djvu2pdf FILE.djvu
#
# Dependencies:
#   On ubuntu, you can install ocrodjvu and pdfbeads with:
#     sudo apt install ocrodjvu
#     gem install pdfbeads
#
# The path and filename given can only contain ascii characters
#
# Output: "<input name without extension>.pdf" in the current directory.
# Intermediate files (pgNNN.html, pgNNN.tiff, ...) are also written to the
# current directory; the script offers to delete them at the end.

# -u: fail on unset variables; pipefail: let a failing djvu2hocr be noticed
# even though its output is piped through sed.
set -u -o pipefail

# Print an error message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# < 1 )) || [[ -z "$1" ]]; then
  printf 'Usage: %s FILE.djvu\n' "${0##*/}" >&2
  exit 2
fi

f=$1
[[ -f "$f" && -r "$f" ]] || die "Cannot read input file: $f"

# Fail early, before any file is written, if a required tool is missing.
for tool in djvused djvu2hocr ddjvu pdfbeads; do
  command -v "$tool" >/dev/null || die "Required tool not found in PATH: $tool"
done

# Get filename
filename=$(basename -- "$f")
file_no_ext="${filename%.*}"

# Count number of pages
echo "f=$f"
p=$(djvused -e n "$f") || die "djvused could not read: $f"
# The page count drives the loop and the zero-padding width, so make sure
# it really is a positive integer before using it.
[[ "$p" =~ ^[0-9]+$ ]] && (( p >= 1 )) || die "Unexpected page count from djvused: '$p'"
printf 'The document contains %s pages.\n\n' "$p"

# Number of digits (used to zero-pad page numbers so that the temp files
# sort in page order)
pp=${#p}

echo "###############################"
echo "### Extracting page by page ###"
echo "###############################"

# For each page, extract the text, and the image
for (( i = 1; i <= p; i++ )); do
  printf -v ii '%0*d' "$pp" "$i"
  # Extraction stays best-effort per page: report a failure but keep going.
  if ! djvu2hocr -p "$i" "$f" | sed 's/ocrx/ocr/g' > "pg${ii}.html"; then
    printf 'Warning: text extraction failed for page %s\n' "$i" >&2
  fi
  if ! ddjvu -format=tiff -page="$i" "$f" "pg${ii}.tiff"; then
    printf 'Warning: image extraction failed for page %s\n' "$i" >&2
  fi
done

echo ""
echo "##############################"
echo "### Building the final pdf ###"
echo "##############################"

# Build the final pdf
# NOTE(review): pdfbeads is called without file arguments, as in the
# original method; presumably it picks up the page images from the current
# directory - confirm against the pdfbeads documentation.
pdfbeads > "${file_no_ext}.pdf" || die "pdfbeads failed to build ${file_no_ext}.pdf"

echo ""
echo "Done"

# Remove temp files
echo ""
read -p "Do you want to delete temp files ? (pg*.html, pg*.tiff, pg*.bg.jpg) " -n 1 -r
echo    # (optional) move to a new line
if [[ "${REPLY:-}" =~ ^[Yy]$ ]]; then
  # -f: do not complain when one of the patterns matches nothing
  rm -f -- pg*.html pg*.tiff pg*.bg.jpg
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Used it just now and everything worked perfectly. The only quirk was that I had to roll back RubyGems with
gem update --system 3.0.8
to get rmagick
to install properly and stop complaining that "constant Gem::ConfigMap is deprecated"
(issue and fix discussed here). Thanks so much for this helpful script!!