Last active
December 22, 2021 21:54
-
-
Save thomaswilburn/dc40032869c7772405d214056757fb5b to your computer and use it in GitHub Desktop.
Remove redaction blobs from PDF before OCR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove redaction blobs from page images before OCR.
# Reads every *.jpg in ./original and writes the cleaned page, under the
# same file name, into ./output.
set -x
set -e

mkdir -p output
cd original
for f in *.jpg; do
  # Image stack handed to convert:
  #   1. the base image
  #   2. the same image with dilate/erode applied to remove thin lines
  #      (i.e., text); all that remains are blobs and larger shapes
  # The two are composed using subtraction. The subtracted image is
  # white-on-black, so negate to reverse it back.
  #
  # NOTE: keep these comments above the command. A comment line placed
  # between backslash-continued lines terminates the command early, leaving
  # a bare `convert <file>` and turning the remaining lines into separate
  # (failing) commands.
  convert "$f" \
    \( "$f" -morphology Dilate Octagon:2 -morphology Erode Octagon:3 \) \
    -compose Minus -composite -negate "../output/$f"
done
cd ..
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this version has step-by-step output for debugging
# Each stage writes its result into its own numbered directory so the
# intermediate images can be inspected. All paths are relative to the
# directory the script is started from; the working directory never changes.
set -e
set -x

# generate page images (written as original/page*.jpg)
mkdir -p original
pdftocairo -jpeg original.pdf original/page

# dilate/erode to just get blob images
mkdir -p 2_eroded
for page in original/*.jpg; do
  name=${page##*/} # file name without the directory part
  convert "$page" -morphology Dilate Octagon:2 -morphology Erode Octagon:3 "2_eroded/$name"
done

# subtract the blobs from the original
mkdir -p 3_subtracted
for page in original/*.jpg; do
  name=${page##*/}
  composite -compose Minus "2_eroded/$name" "$page" "3_subtracted/$name"
done

# convert back from negative form
mkdir -p 4_reverted
for page in 3_subtracted/*.jpg; do
  name=${page##*/}
  convert "$page" -negate "4_reverted/$name"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment