Skip to content

Instantly share code, notes, and snippets.

@addisaden
Created September 14, 2012 12:06
Show Gist options
  • Save addisaden/3721563 to your computer and use it in GitHub Desktop.
Save addisaden/3721563 to your computer and use it in GitHub Desktop.
Search for keywords in a lot of PDFs
require "pdf-reader"

# Searches every PDF in the current directory for the words entered by the
# user. Each word is used as a case-insensitive, multiline regular expression.
# One line per (word, file, page) hit is printed and also collected into a
# report file named after the search words.

# --- Read the search words ---------------------------------------------------
# $stdin is read explicitly: Kernel#gets would read from files named in ARGV.
# A nil line (EOF) is treated as empty input instead of crashing on nil.strip.
print "Bitte geben Sie die zu suchenden Worte ein: "
dateiname = ($stdin.gets || "").split

flags = Regexp::MULTILINE | Regexp::IGNORECASE
words = dateiname.map do |w|
  begin
    Regexp.new(w, flags)
  rescue RegexpError
    # Input that is not a valid pattern (e.g. "(") is searched literally.
    Regexp.new(Regexp.escape(w), flags)
  end
end

# --- Load the text of every page of every PDF --------------------------------
# pdfs maps filename => { page number => page text }.
pdfs = {}
Dir["*.pdf"].each do |filename|
  begin
    pages = {}
    PDF::Reader.new(filename).pages.each do |page|
      pages[page.number] = page.text
    end
    pdfs[filename] = pages
    puts "\"#{ filename }\" importiert."
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
    # One unreadable file should not abort the search over all other files.
    warn "\"#{ filename }\" konnte nicht gelesen werden: #{ e.message }"
  end
end

# --- Search and report -------------------------------------------------------
# Report fragments are collected in an array and joined once at the end
# rather than re-allocating a growing string with +=.
auswerten = []
pdfs.each do |filename, pages|
  pages.each do |number, text|
    words.each do |w|
      next unless text =~ w

      a = "#{ w.inspect } :: #{ filename } :: Seite #{ number }\n"
      auswerten << a
      puts a
    end
  end
  # Blank line between files, both in the report and on the console.
  auswerten << "\n"
  puts ""
end

# --- Write the report --------------------------------------------------------
# Path separators inside a search word would make File.open point into a
# (usually nonexistent) directory, so they are replaced in the file name.
separators = Regexp.union([File::SEPARATOR, File::ALT_SEPARATOR].compact)
report_name = dateiname.join('_').gsub(separators, '_')
File.open("auswertung_#{ report_name }.txt", "w") do |f|
  f.write auswerten.join
end
require "pdf-reader"

# Searches every PDF in the current directory for the words entered by the
# user. Each word is used as a case-insensitive, multiline regular expression.
# For every (word, file, page) hit a header line is reported, followed by one
# "> ..." context line per matching token showing up to five tokens on either
# side of it. The result is printed and written to a report file named after
# the search words.

# Number of whitespace-separated tokens shown before and after a matching one.
context_words = 5

# --- Read the search words ---------------------------------------------------
# $stdin is read explicitly: Kernel#gets would read from files named in ARGV.
# A nil line (EOF) is treated as empty input instead of crashing on nil.strip.
print "Bitte geben Sie die zu suchenden Worte ein: "
dateiname = ($stdin.gets || "").split

flags = Regexp::MULTILINE | Regexp::IGNORECASE
words = dateiname.map do |w|
  begin
    Regexp.new(w, flags)
  rescue RegexpError
    # Input that is not a valid pattern (e.g. "(") is searched literally.
    Regexp.new(Regexp.escape(w), flags)
  end
end

# --- Load the text of every page of every PDF --------------------------------
# pdfs maps filename => { page number => page text }.
pdfs = {}
Dir["*.pdf"].each do |filename|
  begin
    pages = {}
    PDF::Reader.new(filename).pages.each do |page|
      pages[page.number] = page.text
    end
    pdfs[filename] = pages
    puts "\"#{ filename }\" importiert."
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
    # One unreadable file should not abort the search over all other files.
    warn "\"#{ filename }\" konnte nicht gelesen werden: #{ e.message }"
  end
end

# --- Search and report -------------------------------------------------------
# Report fragments are collected in an array and joined once at the end
# rather than re-allocating a growing string with +=.
auswerten = []
pdfs.each do |filename, pages|
  pages.each do |number, text|
    words.each do |w|
      next unless text =~ w

      # Argument-less split ignores leading whitespace, so a page that starts
      # with blanks does not produce an empty first token that would take up
      # a context slot and print a stray space.
      tokens = text.split

      context_lines = []
      tokens.each_with_index do |token, i|
        next unless token =~ w

        # Clamp the window start at 0; a range end past the last token is
        # clamped by Array#[] itself.
        first = [i - context_words, 0].max
        window = tokens[first..(i + context_words)]
        context_lines << "> #{ window.join(' ') } \r\n"
      end

      a = "#{ w.inspect } :: #{ filename } :: Seite #{ number }\r\n" \
          "#{ context_lines.join }\r\n\r\n"
      auswerten << a
      puts a
    end
  end
  # Blank line between files, both in the report and on the console.
  auswerten << "\n"
  puts ""
end

# --- Write the report --------------------------------------------------------
# Path separators inside a search word would make File.open point into a
# (usually nonexistent) directory, so they are replaced in the file name.
separators = Regexp.union([File::SEPARATOR, File::ALT_SEPARATOR].compact)
report_name = dateiname.join('_').gsub(separators, '_')
File.open("auswertung_mit_text___#{ report_name }.txt", "w") do |f|
  f.write auswerten.join
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment