deepakprasanna · June 14, 2011 09:42
diff --git a/doc_pdf_crawler.rb b/doc_pdf_crawler.rb
 # Extracts and prints the plain text of a PDF file (via Apache PDFBox) and of
 # a Word document (via Apache POI) by calling their Java APIs from JRuby.
 #
 # Make sure to run this script using jruby and not the system ruby.
 # To execute the script, run "jruby doc_pdf_crawler.rb" in your terminal.
 #
 # NOTE(review): RAILS_ROOT is not defined anywhere in this script; presumably
 # it is expected to come from an already-loaded Rails environment -- confirm.

 require "java" # Load the Java bridge before any Java class is referenced.

 JARS_PATH = File.join(RAILS_ROOT, "lib/jars")
 Dir["#{JARS_PATH}/*jar"].each { |jar| require jar } # Suck the jars.

 # Importing all the required classes. java_import is JRuby's recommended
 # form; a bare `import` can clash with an `import` defined by other libraries.
 java_import org.apache.pdfbox.pdfparser.PDFParser
 java_import org.apache.pdfbox.pdmodel.PDDocument
 java_import org.apache.pdfbox.util.PDFTextStripper
 java_import org.apache.poi.extractor.ExtractorFactory


 # PDF parsing
 pdf_path = "sample.pdf"
 pdf_file = PDDocument.load pdf_path
 begin
   # Strip the text from the document that was just loaded.
   pdf_text = PDFTextStripper.new.getText(pdf_file)
 ensure
   pdf_file.close # Release the document even if text extraction raises.
 end
 puts pdf_text # text inside the pdf file


 # Doc/docx parsing
 doc_path = "sample.doc"
 doc_file = java.io.FileInputStream.new(doc_path)
 begin
   extractor = ExtractorFactory.createExtractor(doc_file)
   # Pull the text out while the underlying stream is still open.
   doc_text = extractor.text
 ensure
   doc_file.close # Release the file handle even if extraction raises.
 end
 puts doc_text # text inside doc file


 puts "Whoa, im done" #super cool
	# Extracts and prints the plain text of a PDF file (via Apache PDFBox) and of
	# a Word document (via Apache POI) by calling their Java APIs from JRuby.
	#
	# Make sure to run this script using jruby and not the system ruby.
	# To execute the script, run "jruby doc_pdf_crawler.rb" in your terminal.
	#
	# NOTE(review): RAILS_ROOT is not defined anywhere in this script; presumably
	# it is expected to come from an already-loaded Rails environment -- confirm.

	require "java" # Load the Java bridge before any Java class is referenced.

	JARS_PATH = File.join(RAILS_ROOT, "lib/jars")
	Dir["#{JARS_PATH}/*jar"].each { |jar| require jar } # Suck the jars.

	# Importing all the required classes. java_import is JRuby's recommended
	# form; a bare `import` can clash with an `import` defined by other libraries.
	java_import org.apache.pdfbox.pdfparser.PDFParser
	java_import org.apache.pdfbox.pdmodel.PDDocument
	java_import org.apache.pdfbox.util.PDFTextStripper
	java_import org.apache.poi.extractor.ExtractorFactory


	# PDF parsing
	pdf_path = "sample.pdf"
	pdf_file = PDDocument.load pdf_path
	begin
	  # Strip the text from the document that was just loaded.
	  pdf_text = PDFTextStripper.new.getText(pdf_file)
	ensure
	  pdf_file.close # Release the document even if text extraction raises.
	end
	puts pdf_text # text inside the pdf file


	# Doc/docx parsing
	doc_path = "sample.doc"
	doc_file = java.io.FileInputStream.new(doc_path)
	begin
	  extractor = ExtractorFactory.createExtractor(doc_file)
	  # Pull the text out while the underlying stream is still open.
	  doc_text = extractor.text
	ensure
	  doc_file.close # Release the file handle even if extraction raises.
	end
	puts doc_text # text inside doc file


	puts "Whoa, im done" #super cool
No results found