Last active
March 4, 2025 23:45
-
-
Save lfoppiano/3a016123300ec161e23994d0a6c7de98 to your computer and use it in GitHub Desktop.
process_with_pdfalto
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Directory that contains the pdfalto executable. The value below is only a
# default: a PDFALTO_PATH already present in the environment takes precedence,
# so other platform builds can be selected without editing this script.
export PDFALTO_PATH="${PDFALTO_PATH:-./grobid-home/pdfalto/mac_arm-64}"
#######################################
# Recursively convert every PDF under a directory to XML with pdfalto,
# mirroring the input directory layout under the output directory.
# Globals:
#   PDFALTO_PATH (read) - directory containing the pdfalto executable
# Arguments:
#   $1 - input directory to scan
#   $2 - output directory (created if missing). It should not be located
#        inside $1: the newly created output tree would be picked up by the
#        scan and recursed into without end.
# Outputs:
#   "Processing <file>" on stdout for each PDF; failure notices on stderr
# Returns:
#   0 if every conversion succeeded, 1 if mkdir or any pdfalto run failed
#######################################
process_pdf_files() {
  local input_dir="$1"
  local output_dir="$2"
  # Keep the loop variable local so recursive calls cannot clobber it.
  local entry filename
  local status=0

  # Create the output directory if it doesn't exist
  mkdir -p -- "$output_dir" || return 1

  # Loop through all files and directories in the input directory
  for entry in "$input_dir"/*; do
    if [[ -d "$entry" ]]; then
      # Recurse, reproducing the sub-directory name under the output tree.
      process_pdf_files "$entry" "$output_dir/${entry##*/}" || status=1
    elif [[ -f "$entry" && "$entry" == *.[Pp][Dd][Ff] ]]; then
      # Bracket classes match the extension case-insensitively without
      # relying on bash 4 features (macOS ships bash 3.2).
      echo "Processing $entry"
      filename="${entry##*/}"
      # Quote the executable path so directories containing spaces work, and
      # keep going after a failure so one bad PDF does not stop the batch.
      if ! "${PDFALTO_PATH}/./pdfalto" -noImageInline -fullFontName -noImage -readingOrder \
        "$entry" "$output_dir/${filename%.*}.xml"; then
        echo "pdfalto failed for $entry" >&2
        status=1
      fi
    fi
  done

  return "$status"
}
# Main script execution: validate the two positional arguments, then convert
# the whole input tree. The script's exit status is that of the conversion.
input_directory="${1:-}"
output_directory="${2:-}"
if [[ -z "$input_directory" || -z "$output_directory" ]]; then
  # Diagnostics belong on stderr so they never mix with regular output.
  echo "Usage: $0 <input_directory> <output_directory>" >&2
  exit 1
fi
# Fail early on a wrong path instead of creating an empty output tree.
if [[ ! -d "$input_directory" ]]; then
  echo "Error: input directory not found: $input_directory" >&2
  exit 1
fi
process_pdf_files "$input_directory" "$output_directory"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment