Created
September 14, 2012 12:06
-
-
Save addisaden/3721563 to your computer and use it in GitHub Desktop.
Search for keywords in a large number of PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Searches every PDF in the current directory for the search terms entered
# at the prompt and reports, per term, the file and page number of each
# page whose text matches. Hits are printed as they are found and also
# written to "auswertung_<terms>.txt" in the current directory.
#
# Each whitespace-separated term is compiled as a case-insensitive,
# multiline regular expression, so regex syntax in a term is honoured.
require "pdf-reader"

print "Bitte geben Sie die zu suchenden Worte ein: "

# gets returns nil once the input is exhausted; to_s turns that into an
# empty term list instead of a NoMethodError on nil.
terms = gets.to_s.strip.split(/\s+/)
abort "Keine Suchworte angegeben." if terms.empty?

begin
  words = terms.map do |term|
    Regexp.new(term, Regexp::MULTILINE | Regexp::IGNORECASE)
  end
rescue RegexpError => e
  # A term such as "(" is not a valid pattern; report it without a backtrace.
  abort "Fehlerhafter Suchausdruck: #{e.message}"
end

# Phase 1: extract the text of every page, keyed by file name and page number.
pdfs = {}
Dir["*.pdf"].sort.each do |filename|
  begin
    reader = PDF::Reader.new(filename)
    pdfs[filename] = reader.pages.each_with_object({}) do |page, pages|
      pages[page.number] = page.text
    end
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
    # One unreadable file must not abort the whole batch.
    warn "\"#{ filename }\" uebersprungen: #{e.message}"
    next
  end
  puts "\"#{ filename }\" importiert."
end

# Phase 2: test every page against every term and collect the hit lines.
report = []
pdfs.each do |filename, pages|
  pages.each do |number, text|
    words.each do |word|
      next unless text =~ word

      hit = "#{ word.inspect } :: #{ filename } :: Seite #{ number }\n"
      report << hit
      puts hit
    end
  end
  # A blank line separates the results of consecutive files.
  report << "\n"
  puts ""
end

# The terms become part of the file name; anything other than letters,
# digits, "_" and "-" is replaced so that input such as "a/b" or ".*"
# cannot produce an invalid or unintended path.
slug = terms.map { |term| term.gsub(/[^[:alnum:]_\-]+/, "_") }.join("_")
File.open("auswertung_#{ slug }.txt", "w") do |f|
  f.write report.join
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Searches every PDF in the current directory for the search terms entered
# at the prompt. For each page whose text matches a term it reports the
# term, file and page number, followed by one excerpt line per matching
# token showing up to five tokens of context on either side. Hits are
# printed as they are found and also written to
# "auswertung_mit_text___<terms>.txt" in the current directory.
#
# Each whitespace-separated term is compiled as a case-insensitive,
# multiline regular expression, so regex syntax in a term is honoured.
require "pdf-reader"

print "Bitte geben Sie die zu suchenden Worte ein: "

# gets returns nil once the input is exhausted; to_s turns that into an
# empty term list instead of a NoMethodError on nil.
terms = gets.to_s.strip.split(/\s+/)
abort "Keine Suchworte angegeben." if terms.empty?

begin
  words = terms.map do |term|
    Regexp.new(term, Regexp::MULTILINE | Regexp::IGNORECASE)
  end
rescue RegexpError => e
  # A term such as "(" is not a valid pattern; report it without a backtrace.
  abort "Fehlerhafter Suchausdruck: #{e.message}"
end

# Phase 1: extract the text of every page, keyed by file name and page number.
pdfs = {}
Dir["*.pdf"].sort.each do |filename|
  begin
    reader = PDF::Reader.new(filename)
    pdfs[filename] = reader.pages.each_with_object({}) do |page, pages|
      pages[page.number] = page.text
    end
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
    # One unreadable file must not abort the whole batch.
    warn "\"#{ filename }\" uebersprungen: #{e.message}"
    next
  end
  puts "\"#{ filename }\" importiert."
end

# Number of tokens shown before and after each matching token.
context = 5

# Phase 2: test every page against every term and collect hits with excerpts.
report = []
pdfs.each do |filename, pages|
  pages.each do |number, text|
    words.each do |word|
      next unless text =~ word

      # Dropping empty tokens avoids a phantom "" entry when the page text
      # starts with whitespace, which would otherwise shift the context
      # window and add a stray space to the excerpt.
      tokens = text.split(/\s+/).reject(&:empty?)

      excerpts = []
      tokens.each_with_index do |token, index|
        next unless token =~ word

        # Clamp the window start at 0; a range running past the end of the
        # array is truncated by Array#[] itself.
        first = [index - context, 0].max
        window = tokens[first..(index + context)]
        excerpts << "> #{ window.join(' ') } \r\n"
      end

      hit = "#{ word.inspect } :: #{ filename } :: Seite #{ number }\r\n"
      hit += excerpts.join + "\r\n\r\n"
      report << hit
      puts hit
    end
  end
  # A blank line separates the results of consecutive files.
  report << "\n"
  puts ""
end

# The terms become part of the file name; anything other than letters,
# digits, "_" and "-" is replaced so that input such as "a/b" or ".*"
# cannot produce an invalid or unintended path.
slug = terms.map { |term| term.gsub(/[^[:alnum:]_\-]+/, "_") }.join("_")
File.open("auswertung_mit_text___#{ slug }.txt", "w") do |f|
  f.write report.join
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment