Last active
September 9, 2021 16:05
-
-
Save kba/2b4dc816d0eaadc5253218b03f97ddb6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Convert every file of an input file group to TSV with page2tsv and
# optionally register each result in the OCR-D workspace.
#
# Options:
#   -I, --input-file-grp DIR    directory holding the input files
#                               (default: OCR-D-OCR-TESS)
#   -O, --output-file-grp DIR   directory receiving the .tsv files
#                               (default: TSV)
#   -d, --directory DIR         directory to change into before processing
#                               (default: current directory)
#   -P, --ppn PPN               value substituted for "{{ PPN }}" in the IIIF
#                               URL template (default: name of the directory)
#   -W, --no-add-to-workspace   do not call "ocrd workspace add"
#   -w, --add-to-workspace      call "ocrd workspace add" (default)
#       --scale-by-filegrp DIR  derive --scale-factor by dividing the
#                               imageWidth attribute of the input file by the
#                               pixel width of the matching image in DIR
#
# External tools used: page2tsv, ocrd, grep -P, sed, find, and (only with
# --scale-by-filegrp) ImageMagick identify and bc.
set -e

ADD_TO_WORKSPACE=true
INPUT_FILE_GROUP=OCR-D-OCR-TESS
OUTPUT_FILE_GROUP=TSV
DIRECTORY=$PWD
PPN=
IIIF_URL_TEMPLATE="https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ pageno }}/left,top,width,height/full/0/default.jpg"
SCALE_FILEGRP=

# Print a message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Abort unless an option that takes a value is followed by one.
# $1 - number of remaining arguments, $2 - option name
require_value() {
  (( $1 >= 2 )) || die "option $2 requires a value"
}

while [[ ${1-} = -* ]]; do
  case "$1" in
    -I|--input-file-grp)
      require_value $# "$1"; INPUT_FILE_GROUP=$2; shift ;;
    -O|--output-file-grp)
      require_value $# "$1"; OUTPUT_FILE_GROUP=$2; shift ;;
    -d|--directory)
      require_value $# "$1"; DIRECTORY=$2; shift ;;
    -P|--ppn)
      require_value $# "$1"; PPN=$2; shift ;;
    -W|--no-add-to-workspace)
      ADD_TO_WORKSPACE=false ;;
    -w|--add-to-workspace)
      ADD_TO_WORKSPACE=true ;;
    --scale-by-filegrp)
      require_value $# "$1"; SCALE_FILEGRP=$2; shift ;;
    *)
      # Unknown options are still skipped, but no longer silently.
      printf 'warning: ignoring unknown option %s\n' "$1" >&2 ;;
  esac
  shift
done

cd "$DIRECTORY" || die "cannot change to directory: $DIRECTORY"

if [[ -z $PPN ]]; then
  # Use the working directory after cd so that "-d ." yields the real
  # directory name instead of ".".
  PPN=$(basename "$PWD")
fi

[[ -d $INPUT_FILE_GROUP ]] || die "input file group not found: $INPUT_FILE_GROUP"
mkdir -p -- "$OUTPUT_FILE_GROUP"

# Placeholders are kept in variables so they are matched literally.
ppn_placeholder='{{ PPN }}'
pageno_placeholder='{{ pageno }}'

found_input=false
for infile in "$INPUT_FILE_GROUP"/*; do
  # An unmatched glob stays literal; skip it instead of passing it on.
  [[ -e $infile ]] || continue
  found_input=true

  infile_name=${infile##*/}
  outfile=$OUTPUT_FILE_GROUP/${infile_name%.xml}.tsv

  # XXX this does not help, we need IIIF URL
  # img=$(grep -Po 'imageFilename=".*?"' "$infile")
  # img=${img:15:-1}

  scale_factor=1.0
  if [[ -n $SCALE_FILEGRP ]]; then
    numeric_part=$(grep -Po '\d+' <<<"$infile_name" || true)
    [[ -n $numeric_part ]] || die "no digits in file name: $infile_name"

    # Take only the first non-directory match so that identify receives
    # exactly one path.
    compare_file=$(find "$SCALE_FILEGRP" ! -type d \
      -name "*${numeric_part}*" -print -quit)
    [[ -n $compare_file ]] \
      || die "no file matching '*${numeric_part}*' in $SCALE_FILEGRP"

    # "[0]" selects the first frame; otherwise multi-frame images print
    # one width per frame, concatenated.
    should_width=$(identify -format '%w' "${compare_file}[0]")
    [[ $should_width =~ ^[1-9][0-9]*$ ]] \
      || die "cannot determine width of $compare_file"

    is_width=$(grep -Po 'imageWidth="[^"]*' "$infile" \
      | grep -Po '\d+' | head -n 1)
    [[ $is_width =~ ^[0-9]+$ ]] || die "no imageWidth attribute in $infile"

    # -l loads the math library, which sets a non-zero scale; plain bc
    # performs integer division and truncates the factor.
    scale_factor=$(bc -l <<<"$is_width / $should_width")
  fi

  # First four-digit run of the file name that is non-empty after
  # stripping leading zeros (stripping also keeps printf from reading the
  # number as octal); 0 if there is none.
  pageno=$(grep -Po '\d\d\d\d' <<<"$infile_name" \
    | sed 's,^0*,,' | grep -m1 . || true)
  pageno=${pageno:-0}
  printf -v pageno_8 '%08d' "$pageno"
  printf -v pageno_4 '%04d' "$pageno"

  # Replace the first occurrence of each placeholder. The replacement is
  # quoted so characters such as "/" or "&" in the PPN are taken literally.
  iiif_url=${IIIF_URL_TEMPLATE/"$ppn_placeholder"/"$PPN"}
  iiif_url=${iiif_url/"$pageno_placeholder"/"$pageno_8"}

  page2tsv --purpose OCR --scale-factor "$scale_factor" \
    --image-url "$iiif_url" "$infile" "$outfile"

  if [[ $ADD_TO_WORKSPACE = true ]]; then
    ocrd workspace add -i "${outfile##*/}" -m text/tsv \
      -G "$OUTPUT_FILE_GROUP" -g "PHYS_${pageno_4}" "$outfile"
  fi
done

[[ $found_input = true ]] || die "no input files in $INPUT_FILE_GROUP"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment