Skip to content

Instantly share code, notes, and snippets.

@simonsan
Last active January 4, 2025 13:10
Show Gist options
  • Save simonsan/2b0103b5e49c3e6a3a9d9abc0674edca to your computer and use it in GitHub Desktop.
Save simonsan/2b0103b5e49c3e6a3a9d9abc0674edca to your computer and use it in GitHub Desktop.
Batch-processes images through OCR (using ocrs-cli); optionally recurses into subdirectories and skips files that have already been processed.
#!/bin/bash
# Print the command-line synopsis and the option summary, then abort.
# Outputs: four lines of help text on stdout.
# Exits:   always terminates the calling shell with status 1.
usage() {
  printf '%s\n' \
    "Usage: $0 [-r|--recursive] [-o|--output-dir <directory>] <file|directory>" \
    "Options:" \
    " -r, --recursive Process images in subdirectories (only applies to directories)" \
    " -o, --output-dir Directory to store all OCR text files (default: same as image)"
  exit 1
}
# Abort early when no `ocrs` command can be resolved by the shell,
# printing installation hints before exiting with status 1.
command -v ocrs >/dev/null 2>&1 || {
  printf '%s\n' \
    "Error: 'ocrs' command not found" \
    "Please install it using: cargo install ocrs-cli" \
    "If you don't have cargo installed, first install Rust from https://rustup.rs"
  exit 1
}
# Defaults for the command-line settings; argument parsing overrides them.
recursive=false # becomes "true" when -r/--recursive is given
INPUT=          # image file or directory to process (required)
OUTPUT_DIR=     # empty: each text file is written next to its image
# Parse command line arguments into the globals recursive, INPUT and
# OUTPUT_DIR.
# Globals:   recursive, INPUT, OUTPUT_DIR (written)
# Arguments: the script's positional parameters
# Exits:     via usage when -o/--output-dir has no value or when more than
#            one input path is supplied.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -r | --recursive)
        recursive=true
        shift
        ;;
      -o | --output-dir)
        # Without this guard a trailing "-o" makes `shift 2` fail without
        # consuming anything, so the loop would never terminate.
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a directory argument" >&2
          usage
        fi
        OUTPUT_DIR="$2"
        shift 2
        ;;
      *)
        # The first non-option word is the input; a second one is an error.
        if [[ -z "$INPUT" ]]; then
          INPUT="$1"
        else
          usage
        fi
        shift
        ;;
    esac
  done
}
parse_args "$@"
# An input path is mandatory; show the usage text when it is missing.
[[ -n "$INPUT" ]] || usage

# The input path has to exist on disk.
if ! [[ -e "$INPUT" ]]; then
  echo "Error: '$INPUT' does not exist"
  exit 1
fi

# When an output directory was requested and is not there yet, create it
# (including parents) and stop if that fails.
if [[ -n "$OUTPUT_DIR" && ! -d "$OUTPUT_DIR" ]]; then
  if ! mkdir -p "$OUTPUT_DIR"; then
    echo "Error: Could not create output directory '$OUTPUT_DIR'"
    exit 1
  fi
fi
# Result tallies, all starting at zero.
processed=0 skipped=0 errors=0
# Compute the path of the text file that receives the OCR result of an image.
# Globals:   OUTPUT_DIR (read), INPUT (read)
# Arguments: $1 - path of the image
# Outputs:   the text-file path on stdout:
#   - OUTPUT_DIR empty: "<image path>.txt", i.e. next to the image
#   - OUTPUT_DIR set:   "<OUTPUT_DIR>/<name>.txt" where <name> is the image
#     path relative to INPUT with every "/" turned into "_" when INPUT is a
#     directory, or the image's base name otherwise.
get_output_path() {
  local img="$1"

  if [[ -z "$OUTPUT_DIR" ]]; then
    printf '%s\n' "${img}.txt"
    return
  fi

  # "--" keeps names that start with "-" from being parsed as options.
  local rel_path
  if [[ -d "$INPUT" ]]; then
    rel_path=$(realpath --relative-to="$INPUT" -- "$img")
  else
    rel_path=$(basename -- "$img")
  fi

  # Flatten the relative path into a single file name.
  printf '%s\n' "$OUTPUT_DIR/${rel_path//\//_}.txt"
}
# Run OCR on a single image and store the recognised text, unless a result
# file already exists.
# Globals:   OUTPUT_DIR (read)
# Arguments: $1 - path of the image
# Outputs:   progress messages on stdout; appends exactly one status letter
#            per call to /tmp/ocr_progress ("s" skipped, "p" processed,
#            "e" error). The end-of-script summary counts these letters.
# Returns:   0 in all cases; failures are reported through the status letter.
process_image() {
  local img="$1"
  local progress_file=/tmp/ocr_progress
  local out_file ocr_text img_abs_path
  local write_ok=true

  out_file=$(get_output_path "$img")

  if [[ -f "$out_file" ]]; then
    echo "Skipping: $img (output file already exists)"
    echo "s" >> "$progress_file"
    return 0
  fi

  echo "Processing: $img"
  # stderr is captured together with stdout so it can be shown on failure.
  if ! ocr_text=$(ocrs "$img" 2>&1); then
    echo "Error processing: $img"
    echo "$ocr_text"
    echo "e" >> "$progress_file"
    return 0
  fi

  if [[ -n "$OUTPUT_DIR" ]]; then
    # Collected results carry a header naming the source image; fall back to
    # the path as given when it cannot be resolved.
    img_abs_path=$(realpath -- "$img" 2>/dev/null) || img_abs_path=$img
    printf 'Original file: %s\n\n%s\n' "$img_abs_path" "$ocr_text" \
      > "$out_file" || write_ok=false
  else
    # Result goes next to the image, without a header.
    printf '%s\n' "$ocr_text" > "$out_file" || write_ok=false
  fi

  if [[ "$write_ok" == true ]]; then
    echo "p" >> "$progress_file"
  else
    # A result that could not be stored must not count as processed; remove
    # any partial file so a later run retries this image.
    echo "Error writing: $out_file"
    echo "e" >> "$progress_file"
    rm -f -- "$out_file"
  fi
  return 0
}
# Decide from the file name extension whether a path is a supported image.
# Arguments: $1 - path to inspect
# Returns:   0 when the base name ends in .jpg, .jpeg, .png, .tiff or .bmp
#            (any letter case), 1 otherwise.
is_image() {
  local file="$1"
  local name="${file##*/}"

  # A name without a dot has no extension; without this check a file that is
  # literally called "png" would be accepted.
  [[ "$name" == *.* ]] || return 1

  case "$(lowercase "${name##*.}")" in
    jpg | jpeg | png | tiff | bmp) return 0 ;;
    *) return 1 ;;
  esac
}
# Convert a string to lowercase.
# Arguments: $1 - text to convert
# Outputs:   the lower-cased text followed by a newline on stdout
lowercase() {
  # printf instead of echo: echo would treat values such as "-n" or "-E" as
  # options and print nothing.
  printf '%s\n' "$1" | tr '[:upper:]' '[:lower:]'
}
# Run process_image on every supported image directly inside a directory and,
# in recursive mode, descend into its sub-directories.
# Globals:   recursive (read)
# Arguments: $1 - directory to scan
process_directory() {
  local directory="$1"
  local -a types=("jpg" "jpeg" "png" "tiff" "bmp")
  local type img subdir

  # Images of this directory only (-maxdepth 1), matched case-insensitively.
  # NUL-delimited so any file name survives.
  for type in "${types[@]}"; do
    while IFS= read -r -d '' img; do
      process_image "$img"
    done < <(find "$directory" -maxdepth 1 -type f -iname "*.$type" -print0)
  done

  # Process subdirectories if recursive flag is set.
  # Only direct children are listed: each recursive call handles its own
  # children, so listing all descendants here would visit nested directories
  # more than once.
  if [[ "$recursive" == true ]]; then
    while IFS= read -r -d '' subdir; do
      process_directory "$subdir"
    done < <(find "$directory" -mindepth 1 -maxdepth 1 -type d -print0)
  fi
}
# Start from an empty progress log; process_image appends one status letter
# ("p", "s" or "e") per handled file to it.
rm -f /tmp/ocr_progress
touch /tmp/ocr_progress

# Dispatch on the kind of input: a directory is scanned, a single file must
# be a supported image, anything else is rejected.
echo "Starting OCR processing..."
if [[ -d "$INPUT" ]]; then
  echo "Directory: $INPUT"
  echo "Recursive mode: $recursive"
  process_directory "$INPUT"
elif [[ -f "$INPUT" ]]; then
  if ! is_image "$INPUT"; then
    echo "Error: '$INPUT' is not a supported image file"
    exit 1
  fi
  process_image "$INPUT"
else
  echo "Error: '$INPUT' is neither a file nor a directory"
  exit 1
fi

[[ -z "$OUTPUT_DIR" ]] || echo "Output directory: $OUTPUT_DIR"
echo "-------------------"

# Tally the status letters, then discard the progress log.
processed=$(grep -c "p" /tmp/ocr_progress)
skipped=$(grep -c "s" /tmp/ocr_progress)
errors=$(grep -c "e" /tmp/ocr_progress)
rm -f /tmp/ocr_progress

# Final report.
printf '%s\n' \
  "-------------------" \
  "Processing complete!" \
  "Files processed: $processed" \
  "Files skipped: $skipped" \
  "Files failed: $errors"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment