AkBKukU · July 28, 2024 13:14
diff --git a/pdf-search.sh b/pdf-search.sh
 #!/usr/bin/env bash

 # pdf-search.sh - search the PDFs in the current directory for a pattern.
 #
 # Usage: pdf-search.sh PATTERN
 #
 # This script works by using pdftotext and pdfgrep along with standard grep to make a faster PDF searching tool.
 #
 # PDF searching is slow, searching through plain text is fast. This tool extracts all plain text and stores it separately.
 # When a search is done, the pattern is first checked against the plain text to find the list of matching files.
 # It then searches only the matching PDFs to get the page numbers and print the snippet of data.

 # TODO - Add loose matching for bad OCR by doing multiple passes replacing visually similar characters and putting white space in query.

 # Search pattern, taken from the first command-line argument.
 key="$1"

 # Directory (relative to the current directory) that holds the extracted text.
 dir_text="text"

 # Make text dir. -p makes this a no-op when the directory already exists.
 # Stop here if it cannot be created (e.g. a regular file named "text" is in
 # the way): every later step needs this directory.
 if ! mkdir -p -- "$dir_text"
 then
    printf 'pdf-search: cannot create directory: %s\n' "$dir_text" >&2
    exit 1
 fi

 # Index all PDFs
 #
 # index_pdfs: extract the plain text of every PDF in the current directory
 # into "<text dir>/<pdf name>.txt" using pdftotext.
 # Globals:   dir_text (read) - text directory; "text" is used when it is empty.
 # Arguments: none
 index_pdfs() {
    local out_dir="${dir_text:-text}"
    local file txt

    for file in *.pdf
    do
        # When nothing matches, the glob stays the literal string "*.pdf";
        # skip anything that is not an existing regular file.
        [[ -f "$file" ]] || continue

        txt="$out_dir/$file.txt"

        # Extract when there is no text yet, or when the PDF was modified
        # after its text was extracted (otherwise searches use stale text).
        if [[ ! -f "$txt" || "$file" -nt "$txt" ]]
        then
            # The "./" prefix keeps a name starting with "-" from being
            # parsed as a pdftotext option.
            pdftotext "./$file" "$txt"
        fi
    done
 }

 index_pdfs

 # search_pdfs PATTERN
 #
 # Find the extracted text files that match PATTERN (case-insensitive grep),
 # then run pdfgrep on just the corresponding PDFs to print page numbers and
 # matching lines.
 # Globals:   dir_text (read)  - text directory; "text" is used when it is empty.
 #            files (written)  - names of the PDFs whose text matched.
 # Arguments: $1 - pattern passed to grep and pdfgrep.
 # Outputs:   for each matching PDF, an "In [name]" line followed by the
 #            pdfgrep output.
 search_pdfs() {
    local pattern="$1"
    local text_dir="${dir_text:-text}"
    local txt file
    local -a texts=("$text_dir"/*.txt)

    files=()

    # An unmatched glob stays literal: nothing is indexed, nothing to search.
    [[ -e "${texts[0]}" ]] || return 0

    # grep -l prints each matching file name once, one per line, even when
    # only a single file is searched (plain grep omits the "file:" prefix in
    # that case). Reading line by line keeps names with spaces or colons
    # intact. "--" stops a pattern starting with "-" being read as an option.
    while IFS= read -r txt
    do
        # Strip only the leading directory and the trailing ".txt" so the
        # original PDF name is recovered exactly.
        txt="${txt#"$text_dir"/}"
        files+=("${txt%.txt}")
    done < <(grep -il -- "$pattern" "${texts[@]}")

    for file in "${files[@]}"
    do
        printf 'In [%s]\n' "$file"
        pdfgrep -ni -- "$pattern" "$file"
    done
 }

 search_pdfs "$key"
	#!/usr/bin/env bash

	# pdf-search.sh - search the PDFs in the current directory for a pattern.
	#
	# Usage: pdf-search.sh PATTERN
	#
	# This script works by using pdftotext and pdfgrep along with standard grep to make a faster PDF searching tool.
	#
	# PDF searching is slow, searching through plain text is fast. This tool extracts all plain text and stores it separately.
	# When a search is done, the pattern is first checked against the plain text to find the list of matching files.
	# It then searches only the matching PDFs to get the page numbers and print the snippet of data.

	# TODO - Add loose matching for bad OCR by doing multiple passes replacing visually similar characters and putting white space in query.

	# Search pattern, taken from the first command-line argument.
	key="$1"

	# Directory (relative to the current directory) that holds the extracted text.
	dir_text="text"

	# Make text dir. -p makes this a no-op when the directory already exists.
	# Stop here if it cannot be created (e.g. a regular file named "text" is in
	# the way): every later step needs this directory.
	if ! mkdir -p -- "$dir_text"
	then
		printf 'pdf-search: cannot create directory: %s\n' "$dir_text" >&2
		exit 1
	fi

	# Index all PDFs
	#
	# index_pdfs: extract the plain text of every PDF in the current directory
	# into "<text dir>/<pdf name>.txt" using pdftotext.
	# Globals:   dir_text (read) - text directory; "text" is used when it is empty.
	# Arguments: none
	index_pdfs() {
		local out_dir="${dir_text:-text}"
		local file txt

		for file in *.pdf
		do
			# When nothing matches, the glob stays the literal string "*.pdf";
			# skip anything that is not an existing regular file.
			[[ -f "$file" ]] || continue

			txt="$out_dir/$file.txt"

			# Extract when there is no text yet, or when the PDF was modified
			# after its text was extracted (otherwise searches use stale text).
			if [[ ! -f "$txt" || "$file" -nt "$txt" ]]
			then
				# The "./" prefix keeps a name starting with "-" from being
				# parsed as a pdftotext option.
				pdftotext "./$file" "$txt"
			fi
		done
	}

	index_pdfs

	# search_pdfs PATTERN
	#
	# Find the extracted text files that match PATTERN (case-insensitive grep),
	# then run pdfgrep on just the corresponding PDFs to print page numbers and
	# matching lines.
	# Globals:   dir_text (read)  - text directory; "text" is used when it is empty.
	#            files (written)  - names of the PDFs whose text matched.
	# Arguments: $1 - pattern passed to grep and pdfgrep.
	# Outputs:   for each matching PDF, an "In [name]" line followed by the
	#            pdfgrep output.
	search_pdfs() {
		local pattern="$1"
		local text_dir="${dir_text:-text}"
		local txt file
		local -a texts=("$text_dir"/*.txt)

		files=()

		# An unmatched glob stays literal: nothing is indexed, nothing to search.
		[[ -e "${texts[0]}" ]] || return 0

		# grep -l prints each matching file name once, one per line, even when
		# only a single file is searched (plain grep omits the "file:" prefix in
		# that case). Reading line by line keeps names with spaces or colons
		# intact. "--" stops a pattern starting with "-" being read as an option.
		while IFS= read -r txt
		do
			# Strip only the leading directory and the trailing ".txt" so the
			# original PDF name is recovered exactly.
			txt="${txt#"$text_dir"/}"
			files+=("${txt%.txt}")
		done < <(grep -il -- "$pattern" "${texts[@]}")

		for file in "${files[@]}"
		do
			printf 'In [%s]\n' "$file"
			pdfgrep -ni -- "$pattern" "$file"
		done
	}

	search_pdfs "$key"