-
-
Save kavsingh/1a1bf187b5cb5b4678560e640b1457a4 to your computer and use it in GitHub Desktop.
Quick-and-dirty PDF OCR script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Quick-and-dirty PDF OCR: rasterises each page of a PDF to TIFF, runs
# tesseract on every page to produce a searchable single-page PDF, and merges
# the pages into <dest_dir>/ocr-<source file name>.
#
# Based on https://ryanfb.github.io/etc/2014/11/13/command_line_ocr_on_mac_os_x.html
#
# Dependencies:
#   brew install imagemagick tesseract pdftk-java
#
# Usage (make sure to chmod +x first):
#   ./ocr some/image.pdf output
#
# Arguments:
#   $1 - path to the source PDF
#   $2 - destination folder (created if missing)
set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

source_file="${1:-}"
dest_dir="${2:-}"

[[ -n "$source_file" ]] || die "Source file required"
[[ -n "$dest_dir" ]] || die "Destination folder required"
[[ -f "$source_file" ]] || die "Source file not found: $source_file"

source_name=$(basename -- "$source_file")
processed_name="ocr-$source_name"

mkdir -p -- "$dest_dir"

# A fresh, uniquely named work directory: stale pages from an earlier aborted
# run can never leak into this run's merge, and cleanup never removes a
# directory this script did not create.
tmp_dir=$(mktemp -d "$dest_dir/$source_name.tmp.XXXXXX")
cleanup() { rm -rf -- "${tmp_dir:?}"; }
# Fires on every exit path, so failures do not leave the work directory behind.
trap cleanup EXIT

echo "Converting pdf to tiff..."
convert -density 300 "$source_file" "$tmp_dir/page_%03d.tif"

# If the glob matched nothing, the array holds the literal pattern; detect
# that instead of handing a non-existent file to tesseract.
pages=("$tmp_dir"/page_*.tif)
[[ -e "${pages[0]}" ]] || die "No pages were produced from $source_file"

echo "Running OCR..."
for page in "${pages[@]}"; do
  echo "${page##*/}"
  # Second argument is the output base name; tesseract appends ".pdf".
  tesseract "$page" "${page%.tif}" pdf
done

echo "Merging pages..."
# The merged file name starts with "ocr-", so it never matches page_*.pdf.
pdftk "$tmp_dir"/page_*.pdf cat output "$tmp_dir/$processed_name"

cp -- "$tmp_dir/$processed_name" "$dest_dir"

echo "OCR complete: $dest_dir/$processed_name"

# `open` is macOS-specific; skip it where it is unavailable rather than
# failing after the work has already succeeded.
if command -v open >/dev/null 2>&1; then
  open "$dest_dir/$processed_name"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment