Skip to content

Instantly share code, notes, and snippets.

@ashwanthkumar
Last active July 16, 2016 03:39
Show Gist options
  • Save ashwanthkumar/b6ed746019474a575c99de243e039098 to your computer and use it in GitHub Desktop.
Save ashwanthkumar/b6ed746019474a575c99de243e039098 to your computer and use it in GitHub Desktop.
Simple PDFTextExtractor
import java.io.File
import org.apache.pdfbox.cos.COSDocument
import org.apache.pdfbox.io.{RandomAccessBuffer, RandomAccessFile}
import org.apache.pdfbox.pdfparser.PDFParser
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
/**
 * Command-line entry point that prints the text of the first pages of a PDF.
 *
 * Usage: `PDFTextExtractor [path-to-pdf]`. When no argument is supplied the
 * original hard-coded sample path is used, so existing invocations keep working.
 */
object PDFTextExtractor {

  /** PDF that is read when no path is passed on the command line. */
  private val DefaultPdfPath = "/home/master/Desktop/RDD.pdf"

  /** First page handed to the stripper via `setStartPage`. */
  private val FirstPage = 1

  /** Last page handed to the stripper via `setEndPage`. */
  private val LastPage = 5

  /**
   * Parses the PDF, extracts the text of pages [[FirstPage]]..[[LastPage]] and
   * prints it to standard output.
   *
   * @param args optional; `args(0)` is the path of the PDF to read
   */
  def main(args: Array[String]): Unit = {
    val file = new File(args.headOption.getOrElse(DefaultPdfPath))

    // To read from a classpath resource instead of a file, wrap the stream:
    //   new RandomAccessBuffer(PDFTextExtractor.getClass.getResourceAsStream("/pdf-sample.pdf"))
    val source = new RandomAccessFile(file, "r")
    try {
      val parser: PDFParser = new PDFParser(source)
      parser.parse()

      val pdDoc = new PDDocument(parser.getDocument())
      try {
        val pdfStripper = new PDFTextStripper()
        pdfStripper.setStartPage(FirstPage)
        pdfStripper.setEndPage(LastPage)
        println(pdfStripper.getText(pdDoc))
      } finally {
        // Release the document even when text extraction throws.
        pdDoc.close()
      }
    } finally {
      // The document is built from the COSDocument only and is never handed
      // `source`, so close the file handle explicitly as well.
      source.close()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment