Last active
February 20, 2025 14:26
-
-
Save doraTeX/bb77721474174628a144f24a65fa474b to your computer and use it in GitHub Desktop.
A shell script to extract text from PDF on macOS -- https://doratex.hatenablog.jp/entry/20230705/1688554958
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

# Name this script was invoked as (directory part removed); used as the prefix
# of the usage text and of error messages.
# Parameter expansion is used instead of `basename "$0"` so that no subprocess
# is needed and a $0 beginning with '-' is never misread as an option.
SCRIPTNAME=${0##*/}
#######################################
# Print the absolute form of a path.
# The containing directory must exist; the final component does not have to.
# Arguments:
#   $@ - the path; multiple words are joined with single spaces, which keeps
#        callers that pass the path unquoted working as before.
# Outputs:
#   The absolute path on stdout (nothing on failure).
# Returns:
#   0 on success, 1 when the (containing) directory cannot be entered.
#######################################
function realpath () {
  local target="$*"
  local dir base

  if [[ -d "$target" ]]; then
    # A directory resolves to itself; there is no trailing component to append.
    base=""
    dir="$target"
  else
    base="/$(basename -- "$target")"
    dir=$(dirname -- "$target")
  fi

  # Resolve the directory in a subshell so the caller's working directory is
  # untouched. CDPATH is cleared and cd's stdout discarded because cd may print
  # the directory it picked, which would otherwise end up in the result.
  dir=$(CDPATH='' cd -- "$dir" >/dev/null && /bin/pwd) || return 1

  # Avoid producing "//name" for entries that live directly in the root.
  if [[ "$dir" == "/" && -n "$base" ]]; then
    dir=""
  fi

  printf '%s\n' "$dir$base"
}
#######################################
# Print the text of a PDF file using the Quartz framework via osascript.
# For every page the plain string of the page's attributedString is collected,
# and the pages are joined with linefeeds.
# Arguments:
#   $1 - path to the PDF file
# Outputs:
#   The extracted text on stdout; a one-line diagnostic on stderr on failure.
# Returns:
#   0 on success, 1 when no path was given or osascript failed.
#######################################
function PDF2Text () {
  local pdf_path="$1"

  if [[ -z "$pdf_path" ]]; then
    printf 'PDF2Text: missing PDF path\n' >&2
    return 1
  fi

  # The path is handed to osascript as a trailing argument, so a relative path
  # starting with '-' is anchored to keep it from being taken as an option.
  case "$pdf_path" in
    -*) pdf_path="./$pdf_path" ;;
  esac

  # The path reaches AppleScript through the run handler's argv rather than
  # being pasted into the script source, so quotes or backslashes in a file
  # name can neither break the script nor inject code into it.
  # osascript's own stderr is discarded as in the original script (presumably
  # to hide framework noise -- TODO confirm); failure is reported below instead.
  osascript \
    -e 'use framework "Quartz"' \
    -e "on pdf2text(filePath)" \
    -e "set CA to current application" \
    -e "set doc to CA's PDFDocument's alloc's initWithURL:(CA's NSURL's fileURLWithPath:filePath)" \
    -e "set pageCount to doc's pageCount" \
    -e "set resultTexts to CA's NSMutableArray's new()" \
    -e "repeat with i from 1 to pageCount" \
    -e "(resultTexts's addObject:(((doc's pageAtIndex:(i - 1))'s attributedString)'s |string|))" \
    -e "end repeat" \
    -e "return (resultTexts's componentsJoinedByString:linefeed) as text" \
    -e "end pdf2text" \
    -e "on run argv" \
    -e "return my pdf2text(item 1 of argv)" \
    -e "end run" \
    "$pdf_path" 2>/dev/null || {
    printf 'PDF2Text: failed to extract text from %s\n' "$pdf_path" >&2
    return 1
  }
}
#######################################
# Print the command-line help text.
# Globals:
#   SCRIPTNAME (read) - program name shown in the usage line; when it is unset
#                       or empty, the basename of $0 is shown instead.
# Outputs:
#   The help text on stdout, followed by an empty line.
#######################################
function usage() {
  local name="${SCRIPTNAME:-${0##*/}}"

  # One argument per output line; the empty strings produce the blank lines.
  printf '%s\n' \
    "Usage: $name <PATH_TO_PDF_1> <PATH_TO_PDF_2> ..." \
    "" \
    "Options:" \
    " -h, --help Show help" \
    ""
}
# parse arguments
# `args` is a copy of the command line; `params` collects the PDF paths to
# process, in the order they were given.
declare -a args=("$@")
declare -a params=()

for ((I = 0; I < ${#args[@]}; I++)); do
  OPT="${args[I]}"
  case "$OPT" in
    -h | --help)
      usage
      exit 0
      ;;
    -- | -)
      # End-of-options marker: every remaining argument is taken as a path,
      # even if it starts with a dash.
      params+=("${args[@]:I+1}")
      break
      ;;
    -*)
      # Report the option without its leading dashes. This is done with
      # parameter expansion rather than `echo $OPT | sed`, because echo
      # swallows options such as -n or -e and the unquoted expansion would be
      # word-split and glob-expanded.
      dashes="${OPT%%[!-]*}"
      printf "%s: illegal option -- '%s'\n" "$SCRIPTNAME" "${OPT#"$dashes"}" 1>&2
      exit 1
      ;;
    *)
      # Empty arguments are skipped. Anything starting with a dash was already
      # handled by the arms above, so no further check is needed here.
      if [[ -n "$OPT" ]]; then
        params+=("$OPT")
      fi
      ;;
  esac
done

# handle invalid arguments
if ((${#params[@]} == 0)); then
  echo "$SCRIPTNAME: too few arguments" 1>&2
  echo "Try '$SCRIPTNAME --help' for more information." 1>&2
  exit 1
fi
# Extract the text of every requested PDF in the order given.
# "$FILE" is quoted so that paths containing consecutive spaces or glob
# characters reach realpath unchanged. A failure on one file does not stop the
# remaining files, but it makes the script exit with a non-zero status.
status=0
for FILE in "${params[@]}"; do
  PDF2Text "$(realpath "$FILE")" || status=1
done
exit "$status"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment