hdclark · November 5, 2011 20:16
diff --git a/pdf_grep.sh b/pdf_grep.sh
 #!/bin/bash

 ###########################################################################################
 # This script is used to comb a pdf file for a specific phrase. It is a bad, inefficient
 #  script, but I haven't been able to figure out how to properly do it yet (and I've been
 #  looking for several months - combing S.O. and freshmeat/freecode for proper index/search
 #  solutions.)
 #
 # Usage: 
 #      Ensure script is executable:       chmod +x /path/to/pdf_grep.sh
 #      and then:
 #        find /path/to/pdfs/ -type f -exec /path/to/pdf_grep.sh '{}' 'harmonic oscillator' \; 
 #
 # Dependencies:
 #
 #   -find       (replace with locate at your leisure.)
 #   -grep       (or a suitable replacement.)  
 #   -pdftotext  (comes with the poppler library, or pdf-utils in some distros.)
 #
 # Caveats:
 #
 # The pdf text is grepped, so be CAREFUL WITH LONG PHRASES which might be split across two
 #  lines. The output is a line of text which matches the pattern. 
 #
 # How should you use this script? Search "en masse" through a collection of documents. If
 #  any interesting leads appear, follow up with a proper search in a proper pdf viewer. If 
 #  anything, this script will help narrow down searches across libraries of texts.
 #
 # Adapted from a (gross) script at 
 #  http://stackoverflow.com/questions/4643438/how-to-search-contents-of-multiple-pdf-files 
 #  by hal clark in November of 2011.
 ###########################################################################################


 # Require a pdf file plus at least one search term. Without a term grep would
 # receive an empty pattern, which matches (and prints) every line of the file.
 if [ "$#" -lt 2 ] ; then
    echo "Usage: $0 /path/to/file.pdf 'search phrase'" >&2
    exit 2
 fi

 # First argument is the pdf file.
 THEFILE="$1"
 shift

 # All other terms are the search terms. "$*" explicitly joins them into one
 # phrase (separated by spaces under the default IFS); spaces inside a quoted
 # term remain intact.
 THETERMS="$*"

 # PDFtoTEXT options. Kept in an array so every option stays its own word when
 # expanded quoted.
 PDFTOTEXTOPTS=( -q -layout -enc ASCII7 ) # quiet, maintain layout, specify encoding.

 # Grep options.
 # output line numbers, ignore case, color when possible,
 # contextual lines printed.
 GREPOPTS=( -n -i --color=auto -C 2 )


 # File existence check. The complaint goes to stderr and the exit status is
 # non-zero so a missing file is distinguishable from a finished search.
 if [ ! -f "${THEFILE}" ] ; then
    echo "File ${THEFILE} was not found. Exiting." >&2
    exit 1
 fi

 # Begin the actual searching. We have to convert the entire document to text
 # before searching, so grab some popcorn or break out the knitting needles.
 echo "### Searching for phrase \"${THETERMS}\" in file \"${THEFILE}\"."
 # -e marks the phrase as the pattern, so a phrase beginning with '-' is not
 # parsed as a grep option. The script exits with grep's status.
 pdftotext "${PDFTOTEXTOPTS[@]}"  "${THEFILE}"  -  |  grep "${GREPOPTS[@]}"  -e "${THETERMS}"


 # Output the total number of lines of text to give some reference to the output. Will
 # probably double execution time...
 # echo "Total lines in document: $( pdftotext "${PDFTOTEXTOPTS[@]}"  "${THEFILE}"  -  |  wc -l )"
	#!/bin/bash

	###########################################################################################
	# This script is used to comb a pdf file for a specific phrase. It is a bad, inefficient
	# script, but I haven't been able to figure out how to properly do it yet (and I've been
	# looking for several months - combing S.O. and freshmeat/freecode for proper index/search
	# solutions.)
	#
	# Usage:
	# Ensure script is executable: chmod +x /path/to/pdf_grep.sh
	# and then:
	# find /path/to/pdfs/ -type f -exec /path/to/pdf_grep.sh '{}' 'harmonic oscillator' \;
	#
	# Dependencies:
	#
	# -find (replace with locate at your leisure.)
	# -grep (or a suitable replacement.)
	# -pdftotext (comes with the poppler library, or pdf-utils in some distros.)
	#
	# Caveats:
	#
	# The pdf text is grepped, so be CAREFUL WITH LONG PHRASES which might be split across two
	# lines. The output is a line of text which matches the pattern.
	#
	# How should you use this script? Search "en masse" through a collection of documents. If
	# any interesting leads appear, follow up with a proper search in a proper pdf viewer. If
	# anything, this script will help narrow down searches across libraries of texts.
	#
	# Adapted from a (gross) script at
	# http://stackoverflow.com/questions/4643438/how-to-search-contents-of-multiple-pdf-files
	# by hal clark in November of 2011.
	###########################################################################################


	# Require a pdf file plus at least one search term. Without a term grep would
	# receive an empty pattern, which matches (and prints) every line of the file.
	if [ "$#" -lt 2 ] ; then
		echo "Usage: $0 /path/to/file.pdf 'search phrase'" >&2
		exit 2
	fi

	# First argument is the pdf file.
	THEFILE="$1"
	shift

	# All other terms are the search terms. "$*" explicitly joins them into one
	# phrase (separated by spaces under the default IFS); spaces inside a quoted
	# term remain intact.
	THETERMS="$*"

	# PDFtoTEXT options. Kept in an array so every option stays its own word when
	# expanded quoted.
	PDFTOTEXTOPTS=( -q -layout -enc ASCII7 ) # quiet, maintain layout, specify encoding.

	# Grep options.
	# output line numbers, ignore case, color when possible,
	# contextual lines printed.
	GREPOPTS=( -n -i --color=auto -C 2 )


	# File existence check. The complaint goes to stderr and the exit status is
	# non-zero so a missing file is distinguishable from a finished search.
	if [ ! -f "${THEFILE}" ] ; then
		echo "File ${THEFILE} was not found. Exiting." >&2
		exit 1
	fi

	# Begin the actual searching. We have to convert the entire document to text
	# before searching, so grab some popcorn or break out the knitting needles.
	echo "### Searching for phrase \"${THETERMS}\" in file \"${THEFILE}\"."
	# The two commands are joined by a real pipe: an escaped \| would be handed
	# to pdftotext as a literal argument and grep would never run.
	# -e marks the phrase as the pattern, so a phrase beginning with '-' is not
	# parsed as a grep option. The script exits with grep's status.
	pdftotext "${PDFTOTEXTOPTS[@]}" "${THEFILE}" - | grep "${GREPOPTS[@]}" -e "${THETERMS}"


	# Output the total number of lines of text to give some reference to the output. Will
	# probably double execution time...
	# echo "Total lines in document: $( pdftotext "${PDFTOTEXTOPTS[@]}" "${THEFILE}" - | wc -l )"