Skip to content

Instantly share code, notes, and snippets.

@lfoppiano
Last active March 4, 2025 23:45
Show Gist options
  • Save lfoppiano/3a016123300ec161e23994d0a6c7de98 to your computer and use it in GitHub Desktop.
Save lfoppiano/3a016123300ec161e23994d0a6c7de98 to your computer and use it in GitHub Desktop.
process_with_pdfalto
#!/bin/bash
#
# Convert every PDF under an input directory tree with pdfalto, mirroring the
# directory layout under an output directory.
#
# Environment:
#   PDFALTO_PATH  Directory containing the `pdfalto` executable. A value
#                 already present in the environment is honoured, so other
#                 platform builds can be selected without editing this file.
#                 The default is relative to the current working directory.
export PDFALTO_PATH="${PDFALTO_PATH:-./grobid-home/pdfalto/mac_arm-64}"
#######################################
# Recursively convert the PDF files of a directory tree with pdfalto.
#
# Sub-directories of the input directory are processed recursively and
# recreated under the output directory. Each regular file whose name ends in
# ".pdf" (any letter case) is passed to pdfalto, which is asked to write
# "<name without extension>.xml" into the matching output directory.
#
# Globals:
#   PDFALTO_PATH (read) - directory containing the pdfalto executable
# Arguments:
#   $1 - input directory to scan
#   $2 - output directory (created if missing)
# Outputs:
#   "Processing <file>" on stdout for each PDF; diagnostics on stderr
# Returns:
#   0 if every conversion succeeded, 1 if the input directory is missing,
#   the output directory cannot be created, or any pdfalto run failed.
#######################################
process_pdf_files() {
  local input_dir="$1"
  local output_dir="$2"
  local entry name
  local status=0

  # Fail early instead of silently creating an empty output tree.
  if [[ ! -d "$input_dir" ]]; then
    printf 'Error: input directory not found: %s\n' "$input_dir" >&2
    return 1
  fi

  # Create the output directory if it doesn't exist
  mkdir -p -- "$output_dir" || return 1

  # Loop through all files and directories in the input directory.
  # An empty directory leaves the glob unexpanded; the literal pattern is
  # neither a directory nor a regular file, so it falls through both tests.
  for entry in "$input_dir"/*; do
    # Last path component, computed without forking `basename`.
    name="${entry##*/}"
    if [[ -d "$entry" ]]; then
      # Recurse, but keep going with the remaining entries on failure.
      process_pdf_files "$entry" "$output_dir/$name" || status=1
    elif [[ -f "$entry" && "$name" == *.[pP][dD][fF] ]]; then
      echo "Processing $entry"
      # The executable path is quoted so PDFALTO_PATH may contain spaces.
      # ${name%.*} strips the extension regardless of its letter case.
      if ! "${PDFALTO_PATH}/pdfalto" -noImageInline -fullFontName -noImage \
        -readingOrder "$entry" "$output_dir/${name%.*}.xml"; then
        printf 'Warning: pdfalto failed for %s\n' "$entry" >&2
        status=1
      fi
    fi
  done

  return "$status"
}
# Main script execution
#
# Usage: <script> <input_directory> <output_directory>
# Both positional arguments are required. The script's exit status is that of
# process_pdf_files, which is the last command run.
input_directory="$1"
output_directory="$2"
if [[ -z "$input_directory" || -z "$output_directory" ]]; then
  # Usage errors are diagnostics, so they go to stderr rather than stdout.
  echo "Usage: $0 <input_directory> <output_directory>" >&2
  exit 1
fi
# Reject a non-existent input path before any output directory is created.
if [[ ! -d "$input_directory" ]]; then
  echo "Error: input directory not found: $input_directory" >&2
  exit 1
fi
process_pdf_files "$input_directory" "$output_directory"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment