Created
July 28, 2024 13:14
-
-
Save AkBKukU/acdbe5733ab97a38891cce2b7fe51e25 to your computer and use it in GitHub Desktop.
Lazy indexed local PDF search tool.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# This script works by using pdftotext and pdfgrep along with standard grep to make a faster PDF searching tool.
# PDF searching is slow, searching through plain text is fast. This tool extracts all plain text and stores it separately.
# When a search is done it is checked against the plain text first, and a list of matching files is stored.
# It then searches only the matching PDFs to get the page numbers and print the matching snippets.
# TODO - Add loose matching for bad OCR by doing multiple passes replacing visually similar characters and putting white space in query.
# Search pattern, taken verbatim from the first command-line argument.
key="$1"
# Directory (relative to the current directory) that holds the extracted
# plain text, one "<name>.pdf.txt" file per indexed PDF.
dir_text="text"

# Make text dir. `-p` succeeds when the directory already exists, so no
# separate existence test is needed; `--` protects against a name that
# starts with a dash. Nothing below can work without the index directory,
# so stop here if it cannot be created.
mkdir -p -- "$dir_text" || exit 1
#######################################
# Extract the plain text of one PDF into the index, unless it is already
# indexed.
# Arguments:
#   $1 - path of the PDF to index
#   $2 - directory that receives "<pdf>.txt"
# Outputs:
#   writes a warning to stderr when extraction fails
# Returns:
#   0 when the PDF is indexed, already indexed, or does not exist;
#   1 when pdftotext fails
#######################################
index_pdf() {
  local pdf=$1
  local text_dir=$2
  local out="$text_dir/$pdf.txt"

  # An unmatched glob is passed through as the literal pattern ("*.pdf");
  # skip anything that is not an existing regular file.
  [[ -f "$pdf" ]] || return 0

  # Lazy indexing: never re-extract a PDF that already has a text file.
  [[ -f "$out" ]] && return 0

  if ! pdftotext "$pdf" "$out"; then
    printf 'warning: could not extract text from %s\n' "$pdf" >&2
    # Remove any partial output so the next run retries this PDF instead
    # of treating it as already indexed.
    rm -f -- "$out"
    return 1
  fi
}

# Index all PDFs
for file in *.pdf; do
  index_pdf "$file" "$dir_text"
done
#######################################
# List the PDFs whose extracted text matches a pattern.
# Arguments:
#   $1 - search pattern, matched case-insensitively as an extended regex
#        (the same syntax pdfgrep applies to the PDFs themselves)
#   $2 - directory holding the "<pdf>.txt" index files
# Outputs:
#   one PDF name per line on stdout, in glob (sorted) order
# Limitations:
#   names containing a newline are not supported (line-delimited output)
#######################################
list_matching_pdfs() {
  local pattern=$1
  local text_dir=$2
  local -a text_files=("$text_dir"/*.txt)
  local hit

  # Empty index: the glob did not match and is still the literal pattern.
  [[ -e "${text_files[0]}" ]] || return 0

  # `grep -l` prints each matching file name once, even when only a single
  # file is searched, so no parsing of "file:line" output is needed.
  # `-e` keeps a pattern that starts with "-" from being read as an option.
  while IFS= read -r hit; do
    # Strip only the leading directory and the trailing ".txt"; a name such
    # as "notes.txt.pdf" must survive intact.
    hit=${hit#"$text_dir"/}
    printf '%s\n' "${hit%.txt}"
  done < <(grep -liE -e "$pattern" -- "${text_files[@]}")
}

# Collect the matches first so that pdfgrep below does not share stdin with
# the loop that reads them.
files=()
while IFS= read -r file; do
  files+=("$file")
done < <(list_matching_pdfs "$key" "${dir_text:-text}")

# Search only the matching PDFs to get page numbers and snippets.
for file in "${files[@]}"; do
  printf 'In [%s]\n' "$file"
  pdfgrep -ni -- "$key" "$file"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment