Last active
October 18, 2019 12:45
-
-
Save bertsky/1f3a69a2defae662ee40e6b00b4d0d39 to your computer and use it in GitHub Desktop.
Commands to prepare pixel classifier training data from OCR-D GT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Needs OCR-D/core#327 OCR-D/ocrd_olena#10 OCR-D/ocrd_segment#11 bertsky/ocrd_cis
# Runs a preprocessing and resegmentation workflow for GT annotation,
# then extracts page images along with JSON descriptions of region polygons and classes;
# finally, creates a flattened directory under $TARGET.
# Run: preprocess-ocrd-gt.sh [TARGET-DIRECTORY [METS-FILE]]
# (default is all METS files anywhere under CWD)
# Flat output directory for the extracted region images (first argument).
TARGET=${1:-../1000pages-crop-sauvola-denoise-deskew-repair}
# METS file(s) to process (second argument); defaults to every mets.xml found
# below the current directory. Kept as one whitespace-separated string because
# it is expanded unquoted when handed to parallel.
WORKSPACES=${2:-$(find . -name mets.xml)}
#set -e
#######################################
# Repair and process a single OCR-D ground-truth workspace.
# Fixes the MIME type of OCR-D-GT-SEG* entries in mets.xml and the
# imageFilename attribute of the PAGE files, then runs binarization,
# denoising, deskewing, line segmentation, region repair and region
# extraction. Each step is skipped when its output file group already exists.
# Arguments: $1 - path to the workspace's mets.xml
# Outputs:   progress messages on stdout
# Returns:   1 if the workspace directory cannot be entered,
#            otherwise the status of the final popd
#######################################
function process {
    local mets=$1
    local workspace=${mets%mets.xml}
    local page img file
    local -a pages files

    # Exact-match test for an existing file group. A substring match would let
    # e.g. OCR-D-SEG-BLOCK-TESS satisfy the check for OCR-D-SEG-BLOCK.
    # Defined inside process so it is available wherever the exported function
    # is executed (only process itself is exported for parallel).
    _process_has_group() {
        ocrd workspace list-group | grep -qxF -- "$1"
    }

    echo "starting $mets"
    # A bare "mets.xml" leaves an empty prefix, i.e. the current directory.
    # Bail out on failure so the in-place edits never hit the wrong directory.
    pushd "${workspace:-.}" || return 1

    # fix MIME type:
    sed -i.orig 's|MIMETYPE="image/jpeg" ID="OCR-D-GT-SEG|MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-GT-SEG|' mets.xml

    # fix PAGE imageFilename:
    if [[ $mets =~ 1000pages ]]; then
        # fix imageFilename (relative to METS, not to PAGE)
        mapfile -t files < <(ocrd workspace find -m application/vnd.prima.page+xml -k local_filename)
        for file in "${files[@]}"; do
            test -f "$file" || continue
            sed -i.orig 's|imageFilename="../|imageFilename="|' "$file"
        done
    else
        # fix imageFilename (find PAGE filename in METS, find image filename via same pageId in METS):
        mapfile -t pages < <(ocrd workspace find -k pageId | sort -u)
        for page in "${pages[@]}"; do
            [[ -n $page ]] || continue
            img=$(ocrd workspace find -G OCR-D-IMG -g "$page" -k local_filename)
            # Without an image there is nothing sensible to write; do not blank the attribute.
            [[ -n $img ]] || continue
            mapfile -t files < <(
                ocrd workspace find -G OCR-D-GT-SEG-PAGE -g "$page" -k local_filename
                ocrd workspace find -G OCR-D-GT-SEG-BLOCK -g "$page" -k local_filename
            )
            for file in "${files[@]}"; do
                test -f "$file" || continue
                sed -i.orig "s|imageFilename=\"[^\"]*\"|imageFilename=\"$img\"|" "$file"
            done
        done
    fi

    # process
    _process_has_group OCR-D-GT-SEG-BLOCK-BIN ||
        ocrd-olena-binarize -I OCR-D-GT-SEG-BLOCK -O OCR-D-GT-SEG-BLOCK-BIN,OCR-D-IMG-BIN -p <(echo '{"impl": "sauvola-ms-split"}')
    _process_has_group OCR-D-GT-SEG-BLOCK-BIN-DENOISE ||
        ocrd-cis-ocropy-denoise -I OCR-D-GT-SEG-BLOCK-BIN -O OCR-D-GT-SEG-BLOCK-BIN-DENOISE -p <(echo '{"level-of-operation": "page"}')
    _process_has_group OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW ||
        ocrd-cis-ocropy-deskew -I OCR-D-GT-SEG-BLOCK-BIN-DENOISE -O OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW -p <(echo '{"level-of-operation": "page"}')
    _process_has_group OCR-D-SEG-LINE ||
        ocrd-cis-ocropy-segment -I OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW -O OCR-D-SEG-LINE -p <(echo '{"spread": 2.4}')
    _process_has_group OCR-D-SEG-BLOCK ||
        ocrd-segment-repair -I OCR-D-SEG-LINE -O OCR-D-SEG-BLOCK -p <(echo '{"sanitize": true}')
    # The guard must name the group this step actually writes (OCR-D-IMG-REGIONS).
    _process_has_group OCR-D-IMG-REGIONS ||
        ocrd-segment-extract-regions -I OCR-D-SEG-BLOCK -O OCR-D-IMG-REGIONS -p <(echo '{"transparency": true}')

    echo "done with $mets"
    popd
}
# Make process visible to the shells that parallel spawns.
export -f process
echo starting workflow
# for mets in $(find . -name mets.xml); do
#     sem --id preprocess-ocrd-gt -j6 process $mets || return
# done
# sem --id preprocess-ocrd-gt --wait
# Word splitting is intentional: WORKSPACES is a whitespace-separated list.
# shellcheck disable=SC2086
parallel process ::: $WORKSPACES
echo done with workflow
echo "creating flat $TARGET"
# -p: do not fail when re-running into an already existing target directory.
mkdir -p "$TARGET"
# NUL-delimited traversal so paths containing whitespace stay intact.
find . -type f -name "OCR-D-IMG-REGIONS_*" -print0 |
    while IFS= read -r -d '' file; do
        # Workspace path without the file-group directory and leading "./" ...
        dir=${file%/OCR-D-IMG-REGIONS/*}
        dir=${dir#./}
        # ... flattened into the link name by turning "/" into "_".
        ln -rs "$file" "$TARGET/${dir//\//_}_${file##*/}"
    done
#pushd $(dirname $TARGET)
#tar -chvf $(basename $TARGET).tar $(basename $TARGET)
#popd
echo done with everything
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Needs OCR-D/core#327 OCR-D/ocrd_olena#10 OCR-D/ocrd_segment#11 bertsky/ocrd_cis OCR-D/ocrd_tesserocr#80 anyocrbase
# Runs a preprocessing and segmentation workflow for input images,
# then extracts page images along with JSON descriptions of region polygons and classes;
# finally, creates a flattened directory under $TARGET.
# Run: preprocess-ocrd-baseline-tesseract.sh [TARGET-DIRECTORY [METS-FILE]]
# (default is all METS files anywhere under CWD)
# Flat output directory for the extracted region images (first argument).
TARGET=${1:-../1000pages-crop-sauvola-denoise-deskew-tess-repair}
# METS file(s) to process (second argument); defaults to every mets.xml found
# below the current directory. Kept as one whitespace-separated string because
# it is expanded unquoted when handed to parallel.
WORKSPACES=${2:-$(find . -name mets.xml)}
#set -e
#######################################
# Repair and process a single OCR-D workspace with the Tesseract baseline.
# Fixes the MIME type of OCR-D-GT-SEG* entries in mets.xml and the
# imageFilename attribute of the PAGE files. Then starts from
# OCR-D-GT-SEG-PAGE when present (otherwise crops OCR-D-IMG into
# OCR-D-SEG-PAGE) and runs binarization, denoising, page deskewing,
# Tesseract region segmentation, region deskewing, line segmentation,
# region repair and region extraction. Each step is skipped when its
# output file group already exists.
# Arguments: $1 - path to the workspace's mets.xml
# Outputs:   progress messages on stdout
# Returns:   1 if the workspace directory cannot be entered,
#            otherwise the status of the final popd
#######################################
function process {
    local mets=$1
    local workspace=${mets%mets.xml}
    local page img file input_file_group
    local -a pages files

    # Exact-match test for an existing file group. A substring match would let
    # a longer group name satisfy the check for a shorter one.
    # Defined inside process so it is available wherever the exported function
    # is executed (only process itself is exported for parallel).
    _process_has_group() {
        ocrd workspace list-group | grep -qxF -- "$1"
    }

    echo "starting $mets"
    # A bare "mets.xml" leaves an empty prefix, i.e. the current directory.
    # Bail out on failure so the in-place edits never hit the wrong directory.
    pushd "${workspace:-.}" || return 1

    # fix MIME type:
    sed -i 's|MIMETYPE="image/jpeg" ID="OCR-D-GT-SEG|MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-GT-SEG|' mets.xml

    # fix PAGE imageFilename:
    if [[ $mets =~ 1000pages ]]; then
        # fix imageFilename (relative to METS, not to PAGE)
        mapfile -t files < <(ocrd workspace find -m application/vnd.prima.page+xml -k local_filename)
        for file in "${files[@]}"; do
            test -f "$file" || continue
            sed -i 's|imageFilename="../|imageFilename="|' "$file"
        done
    else
        # fix imageFilename (find PAGE filename in METS, find image filename via same pageId in METS):
        mapfile -t pages < <(ocrd workspace find -k pageId | sort -u)
        for page in "${pages[@]}"; do
            [[ -n $page ]] || continue
            img=$(ocrd workspace find -G OCR-D-IMG -g "$page" -k local_filename)
            # Without an image there is nothing sensible to write; do not blank the attribute.
            [[ -n $img ]] || continue
            mapfile -t files < <(
                ocrd workspace find -G OCR-D-GT-SEG-PAGE -g "$page" -k local_filename
                ocrd workspace find -G OCR-D-GT-SEG-BLOCK -g "$page" -k local_filename
            )
            for file in "${files[@]}"; do
                test -f "$file" || continue
                sed -i "s|imageFilename=\"[^\"]*\"|imageFilename=\"$img\"|" "$file"
            done
        done
    fi

    # process
    if _process_has_group OCR-D-GT-SEG-PAGE; then
        input_file_group=OCR-D-GT-SEG-PAGE
    else
        _process_has_group OCR-D-SEG-PAGE ||
            ocrd-anyocrbase-crop -I OCR-D-IMG -O OCR-D-SEG-PAGE
        input_file_group=OCR-D-SEG-PAGE
    fi
    _process_has_group "${input_file_group}-BIN" ||
        ocrd-olena-binarize -I "$input_file_group" -O "${input_file_group}-BIN,OCR-D-IMG-BIN" -p <(echo '{"impl": "sauvola-ms-split"}')
    _process_has_group "${input_file_group}-BIN-DENOISE" ||
        ocrd-cis-ocropy-denoise -I "${input_file_group}-BIN" -O "${input_file_group}-BIN-DENOISE" -p <(echo '{"level-of-operation": "page"}')
    _process_has_group "${input_file_group}-BIN-DENOISE-DESKEW" ||
        ocrd-cis-ocropy-deskew -I "${input_file_group}-BIN-DENOISE" -O "${input_file_group}-BIN-DENOISE-DESKEW" -p <(echo '{"level-of-operation": "page"}')
    _process_has_group OCR-D-SEG-BLOCK-TESS ||
        ocrd-tesserocr-segment-region -I "${input_file_group}-BIN-DENOISE-DESKEW" -O OCR-D-SEG-BLOCK-TESS
    _process_has_group OCR-D-SEG-BLOCK-TESS-DESKEW ||
        ocrd-cis-ocropy-deskew -I OCR-D-SEG-BLOCK-TESS -O OCR-D-SEG-BLOCK-TESS-DESKEW -p <(echo '{"level-of-operation": "region"}')
    _process_has_group OCR-D-SEG-LINE-TESS ||
        ocrd-cis-ocropy-segment -I OCR-D-SEG-BLOCK-TESS-DESKEW -O OCR-D-SEG-LINE-TESS -p <(echo '{"spread": 2.4}')
    _process_has_group OCR-D-SEG-BLOCK-TESS-TIGHT ||
        ocrd-segment-repair -I OCR-D-SEG-LINE-TESS -O OCR-D-SEG-BLOCK-TESS-TIGHT -p <(echo '{"sanitize": true}')
    # The guard must name the group this step actually writes (OCR-D-IMG-REGIONS-TESS).
    _process_has_group OCR-D-IMG-REGIONS-TESS ||
        ocrd-segment-extract-regions -I OCR-D-SEG-BLOCK-TESS-TIGHT -O OCR-D-IMG-REGIONS-TESS -p <(echo '{"transparency": true}')

    echo "done with $mets"
    popd
}
# Make process visible to the shells that parallel spawns.
export -f process
echo starting workflow
# for mets in $(find . -name mets.xml); do
#     sem --id preprocess-ocrd-gt -j6 process $mets || return
# done
# sem --id preprocess-ocrd-gt --wait
# Word splitting is intentional: WORKSPACES is a whitespace-separated list.
# shellcheck disable=SC2086
parallel process ::: $WORKSPACES
echo done with workflow
echo "creating flat $TARGET"
# -p: do not fail when re-running into an already existing target directory.
mkdir -p "$TARGET"
# NUL-delimited traversal so paths containing whitespace stay intact.
find . -type f -name "OCR-D-IMG-REGIONS-TESS_*" -print0 |
    while IFS= read -r -d '' file; do
        # Workspace path without the file-group directory and leading "./" ...
        dir=${file%/OCR-D-IMG-REGIONS-TESS/*}
        dir=${dir#./}
        # ... flattened into the link name by turning "/" into "_".
        ln -rs "$file" "$TARGET/${dir//\//_}_${file##*/}"
    done
#pushd $(dirname $TARGET)
#tar -chvf $(basename $TARGET).tar $(basename $TARGET)
#popd
echo done with everything
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment