Created
July 2, 2012 09:04
-
-
Save ttscoff/3032150 to your computer and use it in GitHub Desktop.
Locate PDFs containing a string via Spotlight and return page numbers for pages containing search string.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
=begin
PDFSearch (OSX-only[^1]) by Brett Terpstra 2012
requires pdf-reader <https://github.com/yob/pdf-reader>
`gem install pdf-reader`
* Searches for the string(s) passed as arguments in PDF files located by Spotlight
* Quote arguments to search for an exact phrase; unquoted arguments are fuzzy
* Returns a Markdown definition list to STDOUT with links to matched files and the page numbers matched
* Progress is reported to STDERR, so output can be redirected silently.
Now I just need to make this a reusable class.
Usage example: pdfsearch twitter > results.md
[^1]: The point of this wasn't to search for PDFs, though, just to work out some details of command line progress reports. Check 'em out; just modify the Spotlight (mdfind) command to something more appropriate.
=end
require 'rubygems'
require 'io/console'
require 'pdf-reader'
# Stream that receives progress/status messages. It is kept separate from
# STDOUT so the Markdown results can be redirected to a file.
$status = STDERR

# On Ctrl-C, terminate the current status line and stop the script.
# Exit with 130 (128 + SIGINT) rather than 0 so shells and calling scripts
# can tell an interrupted run from a completed one.
Signal.trap("SIGINT") do
  update_status("\nInterrupted...", { :last => true })
  exit 130
end
# Best-effort width of the controlling terminal in columns.
# Falls back to 80 when there is no console, when the reported width is not
# positive, or when the width cannot be queried at all.
def terminal_columns
  console = IO.console
  columns = console ? console.winsize[1] : 0
  columns > 0 ? columns : 80
rescue StandardError
  80
end

# Write a single-line status message to the status stream ($status).
#
# update  - String to display. It is cut to the terminal width (minus a
#           5-column margin) so it never wraps onto a second line.
# options - Hash; :last => true ends the line with a newline so the message
#           stays on screen. Otherwise the line ends with a carriage return
#           and is overwritten by the next update.
#
# Returns nothing of interest.
def update_status(update, options = {})
  last = options[:last] || false
  # Query the width on every call so resized windows are handled; the width
  # is clamped to at least one column so a missing or tiny terminal can never
  # produce a negative slice length (which would blank the message).
  width = [terminal_columns - 5, 1].max
  update = update.slice(0, width) if update.length > width
  # if it's not the last output, use a carriage return instead of a newline as terminator
  terminator = last ? "\n" : "\r"
  # "\033[K" clears from the cursor to the end of the line before printing
  $status.print("\033[K#{update}#{terminator}")
  $status.flush
end
# Build the case-insensitive pattern used to test each page.
# Terms are matched literally (regex metacharacters are escaped) and must
# appear in order, separated by any run of non-newline characters.
def page_pattern(terms)
  Regexp.new(terms.map { |term| Regexp.escape(term) }.join('[^\n]*?'), Regexp::IGNORECASE)
end

# Ask Spotlight (mdfind) for PDFs under the home directory matching the string.
# The query is passed as a single argv entry, so no shell ever parses it.
# Raises Errno::ENOENT when the mdfind executable is not available.
def find_pdfs(search_string)
  command = ['mdfind', '-onlyin', File.expand_path('~'), '-interpret', "\"#{search_string}\" kind:pdf"]
  IO.popen(command) { |io| io.read }.split("\n")
end

# Render a ten-segment text progress bar for `done` finished items out of `total`.
def progress_bar(done, total)
  filled = total > 0 ? done * 10 / total : 0
  "[#{'=' * filled}#{' ' * (10 - filled)}]"
end

# Turn a filesystem path into a file:// URL, percent-encoding every byte that
# is not an RFC 3986 unreserved character or a path separator (so spaces and
# parentheses cannot break the generated Markdown link).
def file_url(path)
  encoded = path.each_byte.map do |byte|
    char = byte.chr
    char =~ /[A-Za-z0-9\-._~\/]/ ? char : format('%%%02X', byte)
  end
  "file://#{encoded.join}"
end

# Locate PDFs via Spotlight and find the pages that contain the search terms.
#
# terms     - Array of search Strings (normally ARGV).
# max_pages - PDFs with more pages than this are listed but not scanned
#             (default: 30).
#
# Progress is reported through update_status. When more than 50 files are
# found the user is asked for confirmation on the status stream, and the
# script exits unless the reply starts with "y".
#
# Returns [output, total, scanned, results]:
#   output  - Markdown definition list of files with their matching pages
#   total   - number of files Spotlight returned
#   scanned - number of files whose pages were actually read
#   results - total number of matching pages
def do_scan(terms, max_pages = 30)
  entries = []
  scanned = 0
  results = 0
  # Compiled once; the original rebuilt the regex for every page.
  pattern = page_pattern(terms)

  update_status("Searching for pdfs...")
  files = find_pdfs(terms.join(' '))
  total = files.length

  if total > 50
    # Prompt on the status stream: STDOUT is usually redirected to the
    # results file, where the question would be invisible and pollute output.
    $status.print("Scan #{total} files? (y/N): ")
    $status.flush
    exit unless STDIN.gets =~ /^y/i
  end

  files.each_with_index do |file, index|
    filename = File.basename(file)
    link = "[#{filename}](#{file_url(file)})\n"
    # Overall progress bar and status
    message = "#{progress_bar(index, total)} PDF #{index + 1}/#{total}"
    update_status("#{message}... [#{filename}]")

    begin
      pages = PDF::Reader.new(file).pages
      page_count = pages.length

      if page_count > max_pages
        update_status("Skipped #{filename} (too many pages [#{page_count}])", { :last => true })
        entries << "#{link}: Page scan aborted, too many pages (#{page_count})\n\n"
        next
      end

      matches = []
      pages.each_with_index do |page, page_index|
        # Individual file progress bar
        page_bar = progress_bar(page_index, page_count)
        update_status("#{message}, #{page_bar} pg #{page_index + 1}/#{page_count}... #{matches.length} matches [#{filename}]")
        # Newlines are flattened to spaces so a phrase may span line breaks.
        matches << (page_index + 1) if page.text.gsub(/\n+/, ' ') =~ pattern
      end
    rescue StandardError => e
      # A single unreadable (malformed, encrypted, ...) PDF must not abort the whole scan.
      update_status("Skipped #{filename} (#{e.class}: #{e.message})", { :last => true })
      entries << "#{link}: Page scan failed (#{e.class})\n\n"
      next
    end

    scanned += 1
    next if matches.empty?

    label = matches.length == 1 ? 'Page' : 'Pages'
    entries << "#{link}: #{label} #{matches.join(', ')}\n\n"
    results += matches.length
  end

  [entries.join, total, scanned, results]
end
# Script entry point: search for the command-line terms, print a summary to
# the status stream and the Markdown results to STDOUT.
# Without any terms the Spotlight query and the page pattern would both be
# empty and match every page of every PDF, so show usage and stop instead.
abort "Usage: #{File.basename($0)} SEARCH_TERM [SEARCH_TERM...] > results.md" if ARGV.empty?

output, total, scanned, results = do_scan(ARGV)
update_status("Scanned #{scanned}/#{total} files, #{results} pages contained keywords", { :last => true })
puts output
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment