#!/bin/bash
#
# Batch-processes images through OCR (using ocrs-cli), optionally recursive,
# skips already processed files.
#
# Source: GitHub gist simonsan/2b0103b5e49c3e6a3a9d9abc0674edca
# (last active January 4, 2025 13:10).
#
# NOTE: GitHub flagged this file as containing hidden or bidirectional Unicode
# text that may be interpreted or compiled differently than what appears
# below. To review, open the file in an editor that reveals hidden Unicode
# characters.
# Print the command-line synopsis and option summary, then terminate the
# script.
# Outputs: the usage text on stderr (the function always exits with a failure
#          status, so the text is treated as a diagnostic, not as data).
# Returns: never returns; exits with status 1.
usage() {
  {
    echo "Usage: $0 [-r|--recursive] [-o|--output-dir <directory>] <file|directory>"
    echo "Options:"
    echo " -r, --recursive Process images in subdirectories (only applies to directories)"
    echo " -o, --output-dir Directory to store all OCR text files (default: same as image)"
  } >&2
  exit 1
}
# Abort early when the `ocrs` executable cannot be found on PATH, and tell the
# user how to install it. Diagnostics go to stderr.
if ! command -v ocrs >/dev/null 2>&1; then
  {
    echo "Error: 'ocrs' command not found"
    echo "Please install it using: cargo install ocrs-cli"
    echo "If you don't have cargo installed, first install Rust from https://rustup.rs"
  } >&2
  exit 1
fi
# Defaults for everything the command line can set.
recursive=false # "true" once -r/--recursive has been seen
INPUT=""        # the single positional argument (image file or directory)
OUTPUT_DIR=""   # value of -o/--output-dir; empty means "next to each image"

# Parse the command line into the globals recursive, INPUT and OUTPUT_DIR.
# Arguments: the script's positional parameters ("$@").
# Returns:   0 when parsing succeeds. Calls usage (which exits) when a second
#            positional argument is given or when -o/--output-dir has no value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -r | --recursive)
        recursive=true
        shift
        ;;
      -o | --output-dir)
        # `shift 2` fails WITHOUT shifting when the flag is the last word,
        # which would make this loop spin forever; reject it explicitly.
        if [[ $# -lt 2 ]]; then
          echo "Error: option '$1' requires a directory argument" >&2
          usage
        fi
        OUTPUT_DIR="$2"
        shift 2
        ;;
      *)
        # Only one positional argument is accepted.
        if [[ -z "$INPUT" ]]; then
          INPUT="$1"
        else
          usage
        fi
        shift
        ;;
    esac
  done
}

parse_args "$@"
# A target (image file or directory) is mandatory.
if [[ -z "$INPUT" ]]; then
  usage
fi

# Stop before doing any work when the target does not exist.
if [[ ! -e "$INPUT" ]]; then
  echo "Error: '$INPUT' does not exist" >&2
  exit 1
fi

# Create the output directory on demand when one was requested.
# `--` keeps a directory name that starts with '-' from being read as an
# option by mkdir.
if [[ -n "$OUTPUT_DIR" && ! -d "$OUTPUT_DIR" ]]; then
  if ! mkdir -p -- "$OUTPUT_DIR"; then
    echo "Error: Could not create output directory '$OUTPUT_DIR'" >&2
    exit 1
  fi
fi

# Result counters reported in the final summary.
processed=0
skipped=0
errors=0
# Print the path of the text file that holds the OCR result for an image.
# Globals:
#   OUTPUT_DIR (read) - when empty, the result is placed next to the image
#   INPUT      (read) - the file or directory given on the command line
# Arguments:
#   $1 - path of the image
# Outputs:
#   "<image>.txt" when OUTPUT_DIR is empty; otherwise
#   "<OUTPUT_DIR>/<name>.txt", where <name> is the image path relative to
#   INPUT (or only its basename when INPUT is not a directory) with every '/'
#   replaced by '_'.
get_output_path() {
  local img="$1"

  if [[ -z "$OUTPUT_DIR" ]]; then
    printf '%s\n' "${img}.txt"
    return 0
  fi

  local rel_path
  if [[ -d "$INPUT" ]]; then
    # --relative-to is a GNU realpath option. When realpath is missing or does
    # not support it, fall back to stripping the INPUT prefix textually; the
    # error output is silenced because the fallback handles that case.
    rel_path=$(realpath --relative-to="$INPUT" -- "$img" 2>/dev/null) \
      || rel_path=${img#"${INPUT%/}/"}
  else
    rel_path=$(basename -- "$img")
  fi

  # Flatten the relative path into a single file name.
  # NOTE(review): distinct images can flatten to the same name
  # (e.g. "a/b.png" and "a_b.png"); the naming scheme is kept unchanged so
  # results written by earlier runs are still recognised.
  printf '%s\n' "$OUTPUT_DIR/${rel_path//\//_}.txt"
}
# Run OCR on one image and store the recognised text, unless a result file
# already exists.
# Globals:
#   OUTPUT_DIR (read) - when non-empty, the result file starts with an
#                       "Original file: <absolute path>" header and a blank
#                       line before the OCR text
# Arguments:
#   $1 - path of the image
# Outputs:
#   Progress messages on stdout, failure details on stderr. Appends exactly
#   one line to /tmp/ocr_progress per call: "s" (skipped), "p" (processed) or
#   "e" (failed).
# Returns: 0 in every case; the outcome is reported through the progress file.
process_image() {
  local img="$1"
  # Kept at this fixed path because the summary at the end of the script
  # tallies the same file.
  local progress_file=/tmp/ocr_progress
  local out_file img_abs_path output
  local write_ok=true

  out_file=$(get_output_path "$img")

  if [[ -f "$out_file" ]]; then
    echo "Skipping: $img (output file already exists)"
    echo "s" >> "$progress_file"
    return 0
  fi

  echo "Processing: $img"
  # stdout and stderr of ocrs are captured together so that the failure
  # report below can show whatever ocrs printed.
  if ! output=$(ocrs "$img" 2>&1); then
    echo "Error processing: $img" >&2
    printf '%s\n' "$output" >&2
    echo "e" >> "$progress_file"
    return 0
  fi

  # printf instead of echo: OCR text that happens to look like an echo option
  # (e.g. "-n") must be written verbatim.
  if [[ -n "$OUTPUT_DIR" ]]; then
    img_abs_path=$(realpath -- "$img") || img_abs_path=$img
    {
      printf 'Original file: %s\n\n' "$img_abs_path"
      printf '%s\n' "$output"
    } > "$out_file" || write_ok=false
  else
    printf '%s\n' "$output" > "$out_file" || write_ok=false
  fi

  if [[ "$write_ok" == true ]]; then
    echo "p" >> "$progress_file"
  else
    # A result that could not be written must not be counted as processed,
    # and a partial file must not make the next run skip this image.
    echo "Error writing: $out_file" >&2
    rm -f -- "$out_file"
    echo "e" >> "$progress_file"
  fi
  return 0
}
# Decide from the file name whether a path is a supported image.
# Arguments:
#   $1 - path or file name
# Returns:
#   0 when the final path component ends in .jpg, .jpeg, .png, .tiff or .bmp
#   (letters in any case); 1 otherwise. A dot is required, so a file that is
#   merely called "png" is not treated as an image.
is_image() {
  local file="$1"
  # Match on the last path component only, so dots in directory names cannot
  # influence the result. Bracket patterns give case-insensitive matching
  # without spawning a subshell.
  case "${file##*/}" in
    *.[jJ][pP][gG] | *.[jJ][pP][eE][gG] | *.[pP][nN][gG] | \
      *.[tT][iI][fF][fF] | *.[bB][mM][pP])
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}
# Print a string with its upper-case letters converted to lower case.
# Arguments:
#   $1 - the string to convert
# Outputs:
#   the converted string followed by a newline
lowercase() {
  # printf rather than echo: a value such as "-n" or "-E" would be consumed by
  # echo as an option instead of being printed.
  printf '%s\n' "$1" | tr '[:upper:]' '[:lower:]'
}
# Run process_image on every supported image below a directory.
# Globals:
#   recursive (read) - "true" to include all subdirectories, anything else to
#                      look only at the directory itself
# Arguments:
#   $1 - directory to scan
# Supported extensions (case-insensitive): jpg, jpeg, png, tiff, bmp.
process_directory() {
  local directory="$1"
  local img
  # One find call does the whole traversal, so every file is visited exactly
  # once no matter how deeply it is nested. Without the recursive flag the
  # search is limited to the directory itself.
  local -a depth_limit=(-maxdepth 1)
  if [[ "$recursive" == true ]]; then
    depth_limit=()
  fi

  # NUL-delimited names keep paths with spaces or newlines intact.
  while IFS= read -r -d '' img; do
    process_image "$img"
  done < <(
    find "$directory" "${depth_limit[@]}" -type f \
      \( -iname '*.jpg' -o -iname '*.jpeg' -o -iname '*.png' \
      -o -iname '*.tiff' -o -iname '*.bmp' \) -print0
  )
}
# Start from an empty tally file. process_image appends one letter per image
# to it ("p" processed, "s" skipped, "e" failed).
rm -f /tmp/ocr_progress
touch /tmp/ocr_progress
# Remove the tally file on every exit path, including the error exits below.
trap 'rm -f /tmp/ocr_progress' EXIT

# Start processing
echo "Starting OCR processing..."
if [[ -f "$INPUT" ]]; then
  # Single file: it must carry a supported image extension.
  if is_image "$INPUT"; then
    process_image "$INPUT"
  else
    echo "Error: '$INPUT' is not a supported image file" >&2
    exit 1
  fi
elif [[ -d "$INPUT" ]]; then
  echo "Directory: $INPUT"
  echo "Recursive mode: $recursive"
  process_directory "$INPUT"
else
  # Exists but is neither a regular file nor a directory.
  echo "Error: '$INPUT' is neither a file nor a directory" >&2
  exit 1
fi

if [[ -n "$OUTPUT_DIR" ]]; then
  echo "Output directory: $OUTPUT_DIR"
fi

# Count final results. -x matches whole lines only, so each tally letter is
# counted exactly; grep -c prints 0 (and returns non-zero) when nothing matches.
processed=$(grep -cx "p" /tmp/ocr_progress)
skipped=$(grep -cx "s" /tmp/ocr_progress)
errors=$(grep -cx "e" /tmp/ocr_progress)

echo "-------------------"
echo "Processing complete!"
echo "Files processed: $processed"
echo "Files skipped: $skipped"
echo "Files failed: $errors"
# (GitHub page footer, not part of the script: "Sign up for free to join this
# conversation on GitHub. Already have an account? Sign in to comment")