bertsky · October 18, 2019 12:45
diff --git a/preprocess-ocrd-gt.sh b/preprocess-ocrd-gt.sh
 # Needs OCR-D/core#327 OCR-D/ocrd_olena#10 OCR-D/ocrd_segment#11 bertsky/ocrd_cis
 # Runs a preprocessing and resegmentation workflow for GT annotation,
 # then extracts page images along JSON descriptions of region polygons and classes;
 # finally, creates a flattened directory under $TARGET.
 # Run: preprocess-ocrd-gt.sh [TARGET-DIRECTORY [METS-FILE]]
 # (default is all METS files anywhere under CWD)

 # Output directory for the flattened result: first argument, or the default
 # below when the argument is missing or empty.
 TARGET=${1-}
 if [[ -z $TARGET ]]; then
   TARGET=../1000pages-crop-sauvola-denoise-deskew-repair
 fi
 # METS files to process: second argument, or every mets.xml found below the
 # current directory when the argument is missing or empty.
 WORKSPACES=${2-}
 if [[ -z $WORKSPACES ]]; then
   WORKSPACES=$(find . -name mets.xml)
 fi

 #set -e

 # Repair the METS/PAGE references of one workspace, then run the GT
 # preprocessing and resegmentation workflow inside it.
 # Arguments:
 #   $1 - path to the workspace's mets.xml
 # Returns:
 #   non-zero without touching anything if the workspace directory cannot be
 #   entered; otherwise the status of the final popd.
 # Each workflow step is skipped when its output file group is already listed
 # by `ocrd workspace list-group`. The group name is matched exactly (-x -F),
 # so groups that merely share a prefix (e.g. OCR-D-SEG-BLOCK-TESS) do not
 # cause a step to be skipped.
 function process {
   local mets=$1 workdir img page file
   local -a pages files
   echo "starting $mets"
   workdir=${mets%mets.xml}
   # A bare "mets.xml" leaves an empty prefix: stay in the current directory.
   pushd "${workdir:-.}" || return
   # fix MIME type:
   sed -i.orig 's|MIMETYPE="image/jpeg" ID="OCR-D-GT-SEG|MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-GT-SEG|' mets.xml
   # fix PAGE imageFilename:
   if [[ $mets =~ 1000pages ]]; then
     # fix imageFilename (relative to METS, not to PAGE)
     mapfile -t files < <(ocrd workspace find -m application/vnd.prima.page+xml -k local_filename)
     for file in "${files[@]}"; do
       test -f "$file" || continue
       sed -i.orig 's|imageFilename="../|imageFilename="|' "$file"
     done
   else
     # fix imageFilename (find PAGE filename in METS, find image filename via same pageId in METS):
     mapfile -t pages < <(ocrd workspace find -k pageId | sort -u)
     for page in "${pages[@]}"; do
       img=$(ocrd workspace find -G OCR-D-IMG -g "$page" -k local_filename)
       mapfile -t files < <(
         ocrd workspace find -G OCR-D-GT-SEG-PAGE -g "$page" -k local_filename
         ocrd workspace find -G OCR-D-GT-SEG-BLOCK -g "$page" -k local_filename
       )
       for file in "${files[@]}"; do
         test -f "$file" || continue
         sed -i.orig "s|imageFilename=\"[^\"]*\"|imageFilename=\"$img\"|" "$file"
       done
     done
   fi
   # process
   ocrd workspace list-group | grep -qxF -e OCR-D-GT-SEG-BLOCK-BIN || \
     ocrd-olena-binarize -I OCR-D-GT-SEG-BLOCK -O OCR-D-GT-SEG-BLOCK-BIN,OCR-D-IMG-BIN -p <(echo '{"impl": "sauvola-ms-split"}')
   ocrd workspace list-group | grep -qxF -e OCR-D-GT-SEG-BLOCK-BIN-DENOISE || \
     ocrd-cis-ocropy-denoise -I OCR-D-GT-SEG-BLOCK-BIN -O OCR-D-GT-SEG-BLOCK-BIN-DENOISE -p <(echo '{"level-of-operation": "page"}')
   ocrd workspace list-group | grep -qxF -e OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW || \
     ocrd-cis-ocropy-deskew -I OCR-D-GT-SEG-BLOCK-BIN-DENOISE -O OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW -p <(echo '{"level-of-operation": "page"}')
   ocrd workspace list-group | grep -qxF -e OCR-D-SEG-LINE || \
     ocrd-cis-ocropy-segment -I OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW -O OCR-D-SEG-LINE -p <(echo '{"spread": 2.4}')
   ocrd workspace list-group | grep -qxF -e OCR-D-SEG-BLOCK || \
     ocrd-segment-repair -I OCR-D-SEG-LINE -O OCR-D-SEG-BLOCK -p <(echo '{"sanitize": true}')
   # The guard must name the group this step actually writes.
   ocrd workspace list-group | grep -qxF -e OCR-D-IMG-REGIONS || \
     ocrd-segment-extract-regions -I OCR-D-SEG-BLOCK -O OCR-D-IMG-REGIONS -p <(echo '{"transparency": true}')
   echo "done with $mets"
   popd
 }

 # Make the worker function visible to the shells spawned by GNU parallel.
 export -f process

 echo starting workflow

 # for mets in $(find . -name mets.xml); do 
 #   sem --id preprocess-ocrd-gt -j6 process $mets || return
 # done

 # sem --id preprocess-ocrd-gt --wait

 # $WORKSPACES is a whitespace-separated list of METS paths; word-splitting is
 # intended so that each path becomes one parallel job.
 # shellcheck disable=SC2086
 parallel process ::: $WORKSPACES

 echo done with workflow
 echo "creating flat $TARGET"

 # -p keeps reruns from failing when the directory already exists.
 mkdir -p -- "$TARGET" || exit 1
 # Link every extracted file into $TARGET under a name built from its workspace
 # path (slashes replaced by underscores) plus its own file name.
 while IFS= read -r -d '' file; do
   dir=${file%/OCR-D-IMG-REGIONS/*}
   dir=${dir#./}
   ln -rs -- "$file" "$TARGET/${dir//\//_}_${file##*/}"
 done < <(find . -type f -name "OCR-D-IMG-REGIONS_*" -print0)

 #pushd $(dirname $TARGET)
 #tar -chvf $(basename $TARGET).tar $(basename $TARGET)
 #popd

 echo done with everything
diff --git a/tesseract-baseline-ocrd-gt.sh b/tesseract-baseline-ocrd-gt.sh
 # Needs OCR-D/core#327 OCR-D/ocrd_olena#10 OCR-D/ocrd_segment#11 bertsky/ocrd_cis OCR-D/ocrd_tesserocr#80 anyocrbase
 # Runs a preprocessing and segmentation workflow for input images,
 # then extracts page images along JSON descriptions of region polygons and classes;
 # finally, creates a flattened directory under $TARGET.
 # Run: tesseract-baseline-ocrd-gt.sh [TARGET-DIRECTORY [METS-FILE]]
 # (default is all METS files anywhere under CWD)

 # Output directory for the flattened result: first argument, or the default
 # below when the argument is missing or empty.
 TARGET=${1-}
 if [[ -z $TARGET ]]; then
   TARGET=../1000pages-crop-sauvola-denoise-deskew-tess-repair
 fi
 # METS files to process: second argument, or every mets.xml found below the
 # current directory when the argument is missing or empty.
 WORKSPACES=${2-}
 if [[ -z $WORKSPACES ]]; then
   WORKSPACES=$(find . -name mets.xml)
 fi

 #set -e

 # Repair the METS/PAGE references of one workspace, then run the
 # preprocessing and Tesseract-based segmentation workflow inside it.
 # Arguments:
 #   $1 - path to the workspace's mets.xml
 # Returns:
 #   non-zero without touching anything if the workspace directory cannot be
 #   entered; otherwise the status of the final popd.
 # Starts from OCR-D-GT-SEG-PAGE when that group exists, otherwise from
 # OCR-D-SEG-PAGE (running the crop step first if needed). Each later step is
 # skipped when its output file group is already listed by
 # `ocrd workspace list-group`; names are matched exactly (-x -F).
 function process {
   local mets=$1 workdir img page file input_file_group
   local -a pages files
   echo "starting $mets"
   workdir=${mets%mets.xml}
   # A bare "mets.xml" leaves an empty prefix: stay in the current directory.
   pushd "${workdir:-.}" || return
   # fix MIME type:
   sed -i 's|MIMETYPE="image/jpeg" ID="OCR-D-GT-SEG|MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-GT-SEG|' mets.xml
   # fix PAGE imageFilename:
   if [[ $mets =~ 1000pages ]]; then
     # fix imageFilename (relative to METS, not to PAGE)
     mapfile -t files < <(ocrd workspace find -m application/vnd.prima.page+xml -k local_filename)
     for file in "${files[@]}"; do
       test -f "$file" || continue
       sed -i 's|imageFilename="../|imageFilename="|' "$file"
     done
   else
     # fix imageFilename (find PAGE filename in METS, find image filename via same pageId in METS):
     mapfile -t pages < <(ocrd workspace find -k pageId | sort -u)
     for page in "${pages[@]}"; do
       img=$(ocrd workspace find -G OCR-D-IMG -g "$page" -k local_filename)
       mapfile -t files < <(
         ocrd workspace find -G OCR-D-GT-SEG-PAGE -g "$page" -k local_filename
         ocrd workspace find -G OCR-D-GT-SEG-BLOCK -g "$page" -k local_filename
       )
       for file in "${files[@]}"; do
         test -f "$file" || continue
         sed -i "s|imageFilename=\"[^\"]*\"|imageFilename=\"$img\"|" "$file"
       done
     done
   fi
   # process
   if ocrd workspace list-group | grep -qxF -e OCR-D-GT-SEG-PAGE; then
     input_file_group=OCR-D-GT-SEG-PAGE
   else
     ocrd workspace list-group | grep -qxF -e OCR-D-SEG-PAGE || \
       ocrd-anyocrbase-crop -I OCR-D-IMG -O OCR-D-SEG-PAGE
     input_file_group=OCR-D-SEG-PAGE
   fi
   ocrd workspace list-group | grep -qxF -e "${input_file_group}-BIN" || \
     ocrd-olena-binarize -I "$input_file_group" -O "${input_file_group}-BIN,OCR-D-IMG-BIN" -p <(echo '{"impl": "sauvola-ms-split"}')
   ocrd workspace list-group | grep -qxF -e "${input_file_group}-BIN-DENOISE" || \
     ocrd-cis-ocropy-denoise -I "${input_file_group}-BIN" -O "${input_file_group}-BIN-DENOISE" -p <(echo '{"level-of-operation": "page"}')
   ocrd workspace list-group | grep -qxF -e "${input_file_group}-BIN-DENOISE-DESKEW" || \
     ocrd-cis-ocropy-deskew -I "${input_file_group}-BIN-DENOISE" -O "${input_file_group}-BIN-DENOISE-DESKEW" -p <(echo '{"level-of-operation": "page"}')
   ocrd workspace list-group | grep -qxF -e OCR-D-SEG-BLOCK-TESS || \
     ocrd-tesserocr-segment-region -I "${input_file_group}-BIN-DENOISE-DESKEW" -O OCR-D-SEG-BLOCK-TESS
   ocrd workspace list-group | grep -qxF -e OCR-D-SEG-BLOCK-TESS-DESKEW || \
     ocrd-cis-ocropy-deskew -I OCR-D-SEG-BLOCK-TESS -O OCR-D-SEG-BLOCK-TESS-DESKEW -p <(echo '{"level-of-operation": "region"}')
   ocrd workspace list-group | grep -qxF -e OCR-D-SEG-LINE-TESS || \
     ocrd-cis-ocropy-segment -I OCR-D-SEG-BLOCK-TESS-DESKEW -O OCR-D-SEG-LINE-TESS -p <(echo '{"spread": 2.4}')
   ocrd workspace list-group | grep -qxF -e OCR-D-SEG-BLOCK-TESS-TIGHT || \
     ocrd-segment-repair -I OCR-D-SEG-LINE-TESS -O OCR-D-SEG-BLOCK-TESS-TIGHT -p <(echo '{"sanitize": true}')
   # The guard must name the group this step actually writes.
   ocrd workspace list-group | grep -qxF -e OCR-D-IMG-REGIONS-TESS || \
     ocrd-segment-extract-regions -I OCR-D-SEG-BLOCK-TESS-TIGHT -O OCR-D-IMG-REGIONS-TESS -p <(echo '{"transparency": true}')
   echo "done with $mets"
   popd
 }

 # Make the worker function visible to the shells spawned by GNU parallel.
 export -f process

 echo starting workflow

 # for mets in $(find . -name mets.xml); do 
 #   sem --id preprocess-ocrd-gt -j6 process $mets || return
 # done

 # sem --id preprocess-ocrd-gt --wait

 # $WORKSPACES is a whitespace-separated list of METS paths; word-splitting is
 # intended so that each path becomes one parallel job.
 # shellcheck disable=SC2086
 parallel process ::: $WORKSPACES

 echo done with workflow
 echo "creating flat $TARGET"

 # -p keeps reruns from failing when the directory already exists.
 mkdir -p -- "$TARGET" || exit 1
 # Link every extracted file into $TARGET under a name built from its workspace
 # path (slashes replaced by underscores) plus its own file name.
 while IFS= read -r -d '' file; do
   dir=${file%/OCR-D-IMG-REGIONS-TESS/*}
   dir=${dir#./}
   ln -rs -- "$file" "$TARGET/${dir//\//_}_${file##*/}"
 done < <(find . -type f -name "OCR-D-IMG-REGIONS-TESS_*" -print0)

 #pushd $(dirname $TARGET)
 #tar -chvf $(basename $TARGET).tar $(basename $TARGET)
 #popd

 echo done with everything
	# Needs OCR-D/core#327 OCR-D/ocrd_olena#10 OCR-D/ocrd_segment#11 bertsky/ocrd_cis
	# Runs a preprocessing and resegmentation workflow for GT annotation,
	# then extracts page images along JSON descriptions of region polygons and classes;
	# finally, creates a flattened directory under $TARGET.
	# Run: preprocess-ocrd-gt.sh [TARGET-DIRECTORY [METS-FILE]]
	# (default is all METS files anywhere under CWD)

	# Output directory for the flattened result: first argument, or the default
	# below when the argument is missing or empty.
	TARGET=${1-}
	if [[ -z $TARGET ]]; then
	  TARGET=../1000pages-crop-sauvola-denoise-deskew-repair
	fi
	# METS files to process: second argument, or every mets.xml found below the
	# current directory when the argument is missing or empty.
	WORKSPACES=${2-}
	if [[ -z $WORKSPACES ]]; then
	  WORKSPACES=$(find . -name mets.xml)
	fi

	#set -e

	# Repair the METS/PAGE references of one workspace, then run the GT
	# preprocessing and resegmentation workflow inside it.
	# Arguments:
	#   $1 - path to the workspace's mets.xml
	# Returns:
	#   non-zero without touching anything if the workspace directory cannot be
	#   entered; otherwise the status of the final popd.
	# Each workflow step is skipped when its output file group is already listed
	# by `ocrd workspace list-group`. The group name is matched exactly (-x -F),
	# so groups that merely share a prefix (e.g. OCR-D-SEG-BLOCK-TESS) do not
	# cause a step to be skipped.
	function process {
	  local mets=$1 workdir img page file
	  local -a pages files
	  echo "starting $mets"
	  workdir=${mets%mets.xml}
	  # A bare "mets.xml" leaves an empty prefix: stay in the current directory.
	  pushd "${workdir:-.}" || return
	  # fix MIME type:
	  sed -i.orig 's|MIMETYPE="image/jpeg" ID="OCR-D-GT-SEG|MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-GT-SEG|' mets.xml
	  # fix PAGE imageFilename:
	  if [[ $mets =~ 1000pages ]]; then
	    # fix imageFilename (relative to METS, not to PAGE)
	    mapfile -t files < <(ocrd workspace find -m application/vnd.prima.page+xml -k local_filename)
	    for file in "${files[@]}"; do
	      test -f "$file" || continue
	      sed -i.orig 's|imageFilename="../|imageFilename="|' "$file"
	    done
	  else
	    # fix imageFilename (find PAGE filename in METS, find image filename via same pageId in METS):
	    mapfile -t pages < <(ocrd workspace find -k pageId | sort -u)
	    for page in "${pages[@]}"; do
	      img=$(ocrd workspace find -G OCR-D-IMG -g "$page" -k local_filename)
	      mapfile -t files < <(
	        ocrd workspace find -G OCR-D-GT-SEG-PAGE -g "$page" -k local_filename
	        ocrd workspace find -G OCR-D-GT-SEG-BLOCK -g "$page" -k local_filename
	      )
	      for file in "${files[@]}"; do
	        test -f "$file" || continue
	        sed -i.orig "s|imageFilename=\"[^\"]*\"|imageFilename=\"$img\"|" "$file"
	      done
	    done
	  fi
	  # process
	  ocrd workspace list-group | grep -qxF -e OCR-D-GT-SEG-BLOCK-BIN || \
	    ocrd-olena-binarize -I OCR-D-GT-SEG-BLOCK -O OCR-D-GT-SEG-BLOCK-BIN,OCR-D-IMG-BIN -p <(echo '{"impl": "sauvola-ms-split"}')
	  ocrd workspace list-group | grep -qxF -e OCR-D-GT-SEG-BLOCK-BIN-DENOISE || \
	    ocrd-cis-ocropy-denoise -I OCR-D-GT-SEG-BLOCK-BIN -O OCR-D-GT-SEG-BLOCK-BIN-DENOISE -p <(echo '{"level-of-operation": "page"}')
	  ocrd workspace list-group | grep -qxF -e OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW || \
	    ocrd-cis-ocropy-deskew -I OCR-D-GT-SEG-BLOCK-BIN-DENOISE -O OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW -p <(echo '{"level-of-operation": "page"}')
	  ocrd workspace list-group | grep -qxF -e OCR-D-SEG-LINE || \
	    ocrd-cis-ocropy-segment -I OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW -O OCR-D-SEG-LINE -p <(echo '{"spread": 2.4}')
	  ocrd workspace list-group | grep -qxF -e OCR-D-SEG-BLOCK || \
	    ocrd-segment-repair -I OCR-D-SEG-LINE -O OCR-D-SEG-BLOCK -p <(echo '{"sanitize": true}')
	  # The guard must name the group this step actually writes.
	  ocrd workspace list-group | grep -qxF -e OCR-D-IMG-REGIONS || \
	    ocrd-segment-extract-regions -I OCR-D-SEG-BLOCK -O OCR-D-IMG-REGIONS -p <(echo '{"transparency": true}')
	  echo "done with $mets"
	  popd
	}

	# Make the worker function visible to the shells spawned by GNU parallel.
	export -f process

	echo starting workflow

	# for mets in $(find . -name mets.xml); do
	# sem --id preprocess-ocrd-gt -j6 process $mets || return
	# done

	# sem --id preprocess-ocrd-gt --wait

	# $WORKSPACES is a whitespace-separated list of METS paths; word-splitting is
	# intended so that each path becomes one parallel job.
	# shellcheck disable=SC2086
	parallel process ::: $WORKSPACES

	echo done with workflow
	echo "creating flat $TARGET"

	# -p keeps reruns from failing when the directory already exists.
	mkdir -p -- "$TARGET" || exit 1
	# Link every extracted file into $TARGET under a name built from its workspace
	# path (slashes replaced by underscores) plus its own file name.
	while IFS= read -r -d '' file; do
	  dir=${file%/OCR-D-IMG-REGIONS/*}
	  dir=${dir#./}
	  ln -rs -- "$file" "$TARGET/${dir//\//_}_${file##*/}"
	done < <(find . -type f -name "OCR-D-IMG-REGIONS_*" -print0)

	#pushd $(dirname $TARGET)
	#tar -chvf $(basename $TARGET).tar $(basename $TARGET)
	#popd

	echo done with everything
	# Needs OCR-D/core#327 OCR-D/ocrd_olena#10 OCR-D/ocrd_segment#11 bertsky/ocrd_cis OCR-D/ocrd_tesserocr#80 anyocrbase
	# Runs a preprocessing and segmentation workflow for input images,
	# then extracts page images along JSON descriptions of region polygons and classes;
	# finally, creates a flattened directory under $TARGET.
	# Run: tesseract-baseline-ocrd-gt.sh [TARGET-DIRECTORY [METS-FILE]]
	# (default is all METS files anywhere under CWD)

	# Output directory for the flattened result: first argument, or the default
	# below when the argument is missing or empty.
	TARGET=${1-}
	if [[ -z $TARGET ]]; then
	  TARGET=../1000pages-crop-sauvola-denoise-deskew-tess-repair
	fi
	# METS files to process: second argument, or every mets.xml found below the
	# current directory when the argument is missing or empty.
	WORKSPACES=${2-}
	if [[ -z $WORKSPACES ]]; then
	  WORKSPACES=$(find . -name mets.xml)
	fi

	#set -e

	# Repair the METS/PAGE references of one workspace, then run the
	# preprocessing and Tesseract-based segmentation workflow inside it.
	# Arguments:
	#   $1 - path to the workspace's mets.xml
	# Returns:
	#   non-zero without touching anything if the workspace directory cannot be
	#   entered; otherwise the status of the final popd.
	# Starts from OCR-D-GT-SEG-PAGE when that group exists, otherwise from
	# OCR-D-SEG-PAGE (running the crop step first if needed). Each later step is
	# skipped when its output file group is already listed by
	# `ocrd workspace list-group`; names are matched exactly (-x -F).
	function process {
	  local mets=$1 workdir img page file input_file_group
	  local -a pages files
	  echo "starting $mets"
	  workdir=${mets%mets.xml}
	  # A bare "mets.xml" leaves an empty prefix: stay in the current directory.
	  pushd "${workdir:-.}" || return
	  # fix MIME type:
	  sed -i 's|MIMETYPE="image/jpeg" ID="OCR-D-GT-SEG|MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-GT-SEG|' mets.xml
	  # fix PAGE imageFilename:
	  if [[ $mets =~ 1000pages ]]; then
	    # fix imageFilename (relative to METS, not to PAGE)
	    mapfile -t files < <(ocrd workspace find -m application/vnd.prima.page+xml -k local_filename)
	    for file in "${files[@]}"; do
	      test -f "$file" || continue
	      sed -i 's|imageFilename="../|imageFilename="|' "$file"
	    done
	  else
	    # fix imageFilename (find PAGE filename in METS, find image filename via same pageId in METS):
	    mapfile -t pages < <(ocrd workspace find -k pageId | sort -u)
	    for page in "${pages[@]}"; do
	      img=$(ocrd workspace find -G OCR-D-IMG -g "$page" -k local_filename)
	      mapfile -t files < <(
	        ocrd workspace find -G OCR-D-GT-SEG-PAGE -g "$page" -k local_filename
	        ocrd workspace find -G OCR-D-GT-SEG-BLOCK -g "$page" -k local_filename
	      )
	      for file in "${files[@]}"; do
	        test -f "$file" || continue
	        sed -i "s|imageFilename=\"[^\"]*\"|imageFilename=\"$img\"|" "$file"
	      done
	    done
	  fi
	  # process
	  if ocrd workspace list-group | grep -qxF -e OCR-D-GT-SEG-PAGE; then
	    input_file_group=OCR-D-GT-SEG-PAGE
	  else
	    ocrd workspace list-group | grep -qxF -e OCR-D-SEG-PAGE || \
	      ocrd-anyocrbase-crop -I OCR-D-IMG -O OCR-D-SEG-PAGE
	    input_file_group=OCR-D-SEG-PAGE
	  fi
	  ocrd workspace list-group | grep -qxF -e "${input_file_group}-BIN" || \
	    ocrd-olena-binarize -I "$input_file_group" -O "${input_file_group}-BIN,OCR-D-IMG-BIN" -p <(echo '{"impl": "sauvola-ms-split"}')
	  ocrd workspace list-group | grep -qxF -e "${input_file_group}-BIN-DENOISE" || \
	    ocrd-cis-ocropy-denoise -I "${input_file_group}-BIN" -O "${input_file_group}-BIN-DENOISE" -p <(echo '{"level-of-operation": "page"}')
	  ocrd workspace list-group | grep -qxF -e "${input_file_group}-BIN-DENOISE-DESKEW" || \
	    ocrd-cis-ocropy-deskew -I "${input_file_group}-BIN-DENOISE" -O "${input_file_group}-BIN-DENOISE-DESKEW" -p <(echo '{"level-of-operation": "page"}')
	  ocrd workspace list-group | grep -qxF -e OCR-D-SEG-BLOCK-TESS || \
	    ocrd-tesserocr-segment-region -I "${input_file_group}-BIN-DENOISE-DESKEW" -O OCR-D-SEG-BLOCK-TESS
	  ocrd workspace list-group | grep -qxF -e OCR-D-SEG-BLOCK-TESS-DESKEW || \
	    ocrd-cis-ocropy-deskew -I OCR-D-SEG-BLOCK-TESS -O OCR-D-SEG-BLOCK-TESS-DESKEW -p <(echo '{"level-of-operation": "region"}')
	  ocrd workspace list-group | grep -qxF -e OCR-D-SEG-LINE-TESS || \
	    ocrd-cis-ocropy-segment -I OCR-D-SEG-BLOCK-TESS-DESKEW -O OCR-D-SEG-LINE-TESS -p <(echo '{"spread": 2.4}')
	  ocrd workspace list-group | grep -qxF -e OCR-D-SEG-BLOCK-TESS-TIGHT || \
	    ocrd-segment-repair -I OCR-D-SEG-LINE-TESS -O OCR-D-SEG-BLOCK-TESS-TIGHT -p <(echo '{"sanitize": true}')
	  # The guard must name the group this step actually writes.
	  ocrd workspace list-group | grep -qxF -e OCR-D-IMG-REGIONS-TESS || \
	    ocrd-segment-extract-regions -I OCR-D-SEG-BLOCK-TESS-TIGHT -O OCR-D-IMG-REGIONS-TESS -p <(echo '{"transparency": true}')
	  echo "done with $mets"
	  popd
	}

	# Make the worker function visible to the shells spawned by GNU parallel.
	export -f process

	echo starting workflow

	# for mets in $(find . -name mets.xml); do
	# sem --id preprocess-ocrd-gt -j6 process $mets || return
	# done

	# sem --id preprocess-ocrd-gt --wait

	# $WORKSPACES is a whitespace-separated list of METS paths; word-splitting is
	# intended so that each path becomes one parallel job.
	# shellcheck disable=SC2086
	parallel process ::: $WORKSPACES

	echo done with workflow
	echo "creating flat $TARGET"

	# -p keeps reruns from failing when the directory already exists.
	mkdir -p -- "$TARGET" || exit 1
	# Link every extracted file into $TARGET under a name built from its workspace
	# path (slashes replaced by underscores) plus its own file name.
	while IFS= read -r -d '' file; do
	  dir=${file%/OCR-D-IMG-REGIONS-TESS/*}
	  dir=${dir#./}
	  ln -rs -- "$file" "$TARGET/${dir//\//_}_${file##*/}"
	done < <(find . -type f -name "OCR-D-IMG-REGIONS-TESS_*" -print0)

	#pushd $(dirname $TARGET)
	#tar -chvf $(basename $TARGET).tar $(basename $TARGET)
	#popd

	echo done with everything