ttscoff · July 2, 2012 09:04
diff --git a/pdfsearch.rb b/pdfsearch.rb
 #!/usr/bin/env ruby
 =begin
 PDFSearch (OSX-only[^1]) by Brett Terpstra 2012
 requires pdf-reader <https://github.com/yob/pdf-reader>
 `gem install pdf-reader`

 * Searches for the string(s) passed as arguments in PDF files located by Spotlight
 * Quote arguments to search exact phrase, unquoted arguments are fuzzy
 * Returns a Markdown definition list to STDOUT with links to matched files and page numbers matched
 * Progress reported to STDERR, output can be redirected silently.

 Now I just need to make this a reusable class.

 Usage example: pdfsearch twitter > results.md

 [^1]: The point of this wasn't really to search PDFs, though; it was to work out some details of command-line progress reports. Check them out, and modify the Spotlight (mdfind) command to something more appropriate for your needs.
 =end

 require 'rubygems'
 require 'pdf-reader'

 # Stream that receives every status/progress line; STDERR keeps STDOUT free
 # for the Markdown results.
 $status = STDERR

 # On Ctrl-C, close out the in-place status line with a final notice and quit.
 Signal.trap("SIGINT") {
   update_status("\nInterrupted...", :last => true)
   exit
 }

 # Write a single-line status message to the $status stream.
 #
 # The line is cleared first (ANSI "erase to end of line") and the message is
 # trimmed to the terminal width so that successive updates overwrite each
 # other in place instead of wrapping onto a second line.
 #
 # update  - String message to display.
 # options - Hash of options:
 #           :last - when true, terminate with a newline (and flush) so the
 #                   message stays on screen; otherwise terminate with a
 #                   carriage return so the next update overwrites it.
 def update_status(update,options = {})
   last = options[:last] || false

   # Ask `tput` for the width on every call so window resizes are picked up.
   # If tput is missing, or reports nothing usable, assume 80 columns instead
   # of ending up with a negative width that would blank the message.
   width = begin
     %x{tput cols}.strip.to_i
   rescue SystemCallError
     0
   end
   width = 80 unless width > 0
   # Keep a small right margin, but never trim below one character
   cols = [width - 5, 1].max

   # Trim output so it doesn't break to a second line
   update = update.slice(0,cols) if update.length > cols
   # Intermediate updates end with a carriage return so they get overwritten
   terminator = last ? "\n" : "\r"
   $status.printf("\033[K%s%s",update,terminator)

   $status.flush if last
 end

 # Render a ten-segment text progress bar such as "[===       ]".
 #
 # done  - Integer number of items already completed.
 # total - Integer number of items overall (must be greater than zero).
 def progress_bar(done, total)
   filled = done * 10 / total # integer division: only whole segments are drawn
   "[" + "=" * filled + " " * (10 - filled) + "]"
 end

 # Locate PDFs with Spotlight and scan their pages for the search terms.
 #
 # terms - Array of String search terms. Each term is matched literally and
 #         case-insensitively; consecutive terms must appear in order on a
 #         page, separated by any text.
 #
 # Files with more than 30 pages are listed but not scanned. If Spotlight
 # returns more than 50 files the user is asked to confirm (on the status
 # stream, so redirected STDOUT stays clean) and the process exits unless the
 # reply starts with "y". Files that pdf-reader reports as malformed or
 # unsupported are skipped with a status message instead of aborting the scan.
 #
 # Returns [output, total, scanned, results]: the Markdown definition list,
 # the number of files found, the number of files scanned, and the number of
 # matching pages.
 def do_scan(terms)
   output = ""
   scanned = 0
   results = 0
   search_string = terms.join(' ')
   # Terms are plain text, so escape regexp metacharacters ("c++", "(tm)", ...).
   # Built once, with only IGNORECASE: the literal's old /s flag selects
   # Windows-31J encoding in Ruby and fails on non-ASCII UTF-8 page text.
   pattern = Regexp.new(terms.map { |term| Regexp.escape(term) }.join('[^\n]*?'), Regexp::IGNORECASE)

   update_status("Searching for pdfs...")
   # Array form of IO.popen (Ruby 1.9+) runs mdfind without a shell, so quotes
   # or shell metacharacters in the terms cannot break or inject commands.
   query = %("#{search_string}" kind:pdf)
   res = IO.popen(['mdfind', '-onlyin', File.expand_path('~'), '-interpret', query]) { |io| io.read }.split("\n")
   total = res.length

   if total > 50
     # Prompt on the status stream; STDOUT may be redirected to a results file
     $status.print("Scan #{total} files? (y/N): ")
     $status.flush
     reply = STDIN.gets
     exit unless reply =~ /^y/i
   end

   res.each_with_index do |file, count|
     filename = File.basename(file)
     link = "[" + filename + "](file://#{file.gsub(" ",'%20')})\n"
     # Bar reflects files completed; the counter shows the file being worked on
     message = "#{progress_bar(count, total)} PDF #{count + 1}/#{total}"
     update_status(message + "... [#{filename}]")

     out = []
     begin
       pdf = PDF::Reader.new(file)
       page_count = pdf.pages.length
       if page_count > 30
         update_status("Skipped #{filename} (too many pages [#{page_count}])", {:last => true})
         output << link
         output << ": Page scan aborted, too many pages (#{page_count})\n\n"
         next
       end

       pdf.pages.each_with_index do |p, i|
         update_status("#{message}, #{progress_bar(i, page_count)} pg #{i + 1}/#{page_count}... #{out.length} matches [#{filename}]")
         # Collapse line breaks so a phrase split across lines still matches
         out.push(i + 1) if p.text.gsub(/\n+/,' ') =~ pattern
       end
     rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
       # One unreadable file should not abort the whole scan
       update_status("Skipped #{filename} (#{e.class}: #{e.message})", {:last => true})
       next
     end

     unless out.empty?
       output << link
       output << (out.length == 1 ? ": Page #{out[0]}\n\n" : ": Pages #{out.join(', ')}\n\n")
       results += out.length
     end
     scanned += 1
   end
   [output,total,scanned,results]
 end

 # Run the scan for the command-line terms, print a closing summary on the
 # status stream, then emit the Markdown report on STDOUT.
 report, files_found, files_scanned, page_hits = do_scan(ARGV)
 summary = "Scanned #{files_scanned}/#{files_found} files, #{page_hits} pages contained keywords"
 update_status(summary, :last => true)
 puts report
	#!/usr/bin/env ruby
	=begin
	PDFSearch (OSX-only[^1]) by Brett Terpstra 2012
	requires pdf-reader <https://github.com/yob/pdf-reader>
	`gem install pdf-reader`

	* Searches for the string(s) passed as arguments in PDF files located by Spotlight
	* Quote arguments to search exact phrase, unquoted arguments are fuzzy
	* Returns a Markdown definition list to STDOUT with links to matched files and page numbers matched
	* Progress reported to STDERR, output can be redirected silently.

	Now I just need to make this a reusable class.

	Usage example: pdfsearch twitter > results.md

	[^1]: The point of this wasn't really to search PDFs, though; it was to work out some details of command-line progress reports. Check them out, and modify the Spotlight (mdfind) command to something more appropriate for your needs.
	=end

	require 'rubygems'
	require 'pdf-reader'

	# Stream that receives every status/progress line; STDERR keeps STDOUT free
	# for the Markdown results.
	$status = STDERR

	# On Ctrl-C, close out the in-place status line with a final notice and quit.
	Signal.trap("SIGINT") {
	  update_status("\nInterrupted...", :last => true)
	  exit
	}

	# Write a single-line status message to the $status stream.
	#
	# The line is cleared first (ANSI "erase to end of line") and the message is
	# trimmed to the terminal width so that successive updates overwrite each
	# other in place instead of wrapping onto a second line.
	#
	# update  - String message to display.
	# options - Hash of options:
	#           :last - when true, terminate with a newline (and flush) so the
	#                   message stays on screen; otherwise terminate with a
	#                   carriage return so the next update overwrites it.
	def update_status(update,options = {})
	  last = options[:last] || false

	  # Ask `tput` for the width on every call so window resizes are picked up.
	  # If tput is missing, or reports nothing usable, assume 80 columns instead
	  # of ending up with a negative width that would blank the message.
	  width = begin
	    %x{tput cols}.strip.to_i
	  rescue SystemCallError
	    0
	  end
	  width = 80 unless width > 0
	  # Keep a small right margin, but never trim below one character
	  cols = [width - 5, 1].max

	  # Trim output so it doesn't break to a second line
	  update = update.slice(0,cols) if update.length > cols
	  # Intermediate updates end with a carriage return so they get overwritten
	  terminator = last ? "\n" : "\r"
	  $status.printf("\033[K%s%s",update,terminator)

	  $status.flush if last
	end

	# Render a ten-segment text progress bar such as "[===       ]".
	#
	# done  - Integer number of items already completed.
	# total - Integer number of items overall (must be greater than zero).
	def progress_bar(done, total)
	  filled = done * 10 / total # integer division: only whole segments are drawn
	  "[" + "=" * filled + " " * (10 - filled) + "]"
	end

	# Locate PDFs with Spotlight and scan their pages for the search terms.
	#
	# terms - Array of String search terms. Each term is matched literally and
	#         case-insensitively; consecutive terms must appear in order on a
	#         page, separated by any text.
	#
	# Files with more than 30 pages are listed but not scanned. If Spotlight
	# returns more than 50 files the user is asked to confirm (on the status
	# stream, so redirected STDOUT stays clean) and the process exits unless the
	# reply starts with "y". Files that pdf-reader reports as malformed or
	# unsupported are skipped with a status message instead of aborting the scan.
	#
	# Returns [output, total, scanned, results]: the Markdown definition list,
	# the number of files found, the number of files scanned, and the number of
	# matching pages.
	def do_scan(terms)
	  output = ""
	  scanned = 0
	  results = 0
	  search_string = terms.join(' ')
	  # Terms are plain text, so escape regexp metacharacters ("c++", "(tm)", ...).
	  # Built once, with only IGNORECASE: the literal's old /s flag selects
	  # Windows-31J encoding in Ruby and fails on non-ASCII UTF-8 page text.
	  pattern = Regexp.new(terms.map { |term| Regexp.escape(term) }.join('[^\n]*?'), Regexp::IGNORECASE)

	  update_status("Searching for pdfs...")
	  # Array form of IO.popen (Ruby 1.9+) runs mdfind without a shell, so quotes
	  # or shell metacharacters in the terms cannot break or inject commands.
	  query = %("#{search_string}" kind:pdf)
	  res = IO.popen(['mdfind', '-onlyin', File.expand_path('~'), '-interpret', query]) { |io| io.read }.split("\n")
	  total = res.length

	  if total > 50
	    # Prompt on the status stream; STDOUT may be redirected to a results file
	    $status.print("Scan #{total} files? (y/N): ")
	    $status.flush
	    reply = STDIN.gets
	    exit unless reply =~ /^y/i
	  end

	  res.each_with_index do |file, count|
	    filename = File.basename(file)
	    link = "[" + filename + "](file://#{file.gsub(" ",'%20')})\n"
	    # Bar reflects files completed; the counter shows the file being worked on
	    message = "#{progress_bar(count, total)} PDF #{count + 1}/#{total}"
	    update_status(message + "... [#{filename}]")

	    out = []
	    begin
	      pdf = PDF::Reader.new(file)
	      page_count = pdf.pages.length
	      if page_count > 30
	        update_status("Skipped #{filename} (too many pages [#{page_count}])", {:last => true})
	        output << link
	        output << ": Page scan aborted, too many pages (#{page_count})\n\n"
	        next
	      end

	      pdf.pages.each_with_index do |p, i|
	        update_status("#{message}, #{progress_bar(i, page_count)} pg #{i + 1}/#{page_count}... #{out.length} matches [#{filename}]")
	        # Collapse line breaks so a phrase split across lines still matches
	        out.push(i + 1) if p.text.gsub(/\n+/,' ') =~ pattern
	      end
	    rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
	      # One unreadable file should not abort the whole scan
	      update_status("Skipped #{filename} (#{e.class}: #{e.message})", {:last => true})
	      next
	    end

	    unless out.empty?
	      output << link
	      output << (out.length == 1 ? ": Page #{out[0]}\n\n" : ": Pages #{out.join(', ')}\n\n")
	      results += out.length
	    end
	    scanned += 1
	  end
	  [output,total,scanned,results]
	end

	# Run the scan for the command-line terms, print a closing summary on the
	# status stream, then emit the Markdown report on STDOUT.
	report, files_found, files_scanned, page_hits = do_scan(ARGV)
	summary = "Scanned #{files_scanned}/#{files_found} files, #{page_hits} pages contained keywords"
	update_status(summary, :last => true)
	puts report