Created
November 8, 2016 16:55
-
-
Save JoelGeraci-Datalogics/2dec26205488610e88facde495b01e62 to your computer and use it in GitHub Desktop.
Searches for a word, highlights it, then extracts all pages that contain that word
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Copyright Datalogics, Inc. 2015
 */
package pdfjt.cookbook.document; | |
import com.adobe.fontengine.font.Font;
import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.graphics.font.PDFFont;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationEnum;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationHighlight;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.pdf.page.PDFPageTree;
import com.adobe.pdfjt.services.ap.AppearanceService;
import com.adobe.pdfjt.services.ap.spi.APContext;
import com.adobe.pdfjt.services.ap.spi.APResources;
import com.adobe.pdfjt.services.manipulations.PMMOptions;
import com.adobe.pdfjt.services.manipulations.PMMService;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.textextraction.Word;
import com.adobe.pdfjt.services.textextraction.WordsIterator;
import com.datalogics.pdf.document.DocumentHelper;
import com.datalogics.pdf.document.FontSetLoader;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
/** | |
* Searches for a word, highlights it, then extracts all pages that contain that word. | |
*/ | |
public class ExtractPagesMatchingSearchTerms { | |
private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/PDF32000_2008.pdf"; | |
private static final String outputDir = "cookbook/Document/output/"; | |
static public void main(String[] args) throws Exception { | |
// First read in the PDF file | |
URLConnection connection = new URL(inputPDFURL).openConnection(); | |
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); | |
connection.connect(); | |
InputStream fis = connection.getInputStream(); | |
ByteReader byteReader = new InputStreamByteReader(fis); | |
PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance()); | |
PDFPageTree pdfPageTree = pdfDocument.requirePages(); | |
// This List will hold the pages that need to be extracted | |
List<PDFPage> pagesToExtract = new ArrayList<>(); | |
ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument, FontSetLoader.newInstance().getFontSet()); | |
WordsIterator wordsIterator = textExtractor.getWordsIterator(); | |
System.out.println("Pages Extracted:"); | |
while (wordsIterator.hasNext()) { | |
Word word = wordsIterator.next(); | |
if (word.toString().toLowerCase().contains("javascript")) { | |
System.out.println(word.getPageNumber()); | |
/* | |
* Create a new Highlight annotation and use the location | |
* properties of the word to set the properties of the | |
* annotation. | |
*/ | |
PDFAnnotationHighlight pdfAnnotationHighlight = PDFAnnotationHighlight.newInstance(pdfDocument); | |
pdfAnnotationHighlight.setQuadPoints(wordQuadsToAnnotQuads(word)); | |
pdfAnnotationHighlight.setColor(new double[] { 1, 0.819611, 0 }); // yellow to match Acrobat Highlights | |
PDFPage pdfPage = pdfPageTree.getPage(word.getPageNumber()-1); | |
pdfPage.addAnnotation(pdfAnnotationHighlight); | |
// Add the page to the list of pages to extract if it's not already there. | |
if (pagesToExtract.contains(pdfPage) == false) { | |
pagesToExtract.add(pdfPage); | |
} | |
} | |
} | |
// Now create the appearances of the Highlight annotations | |
APResources apResources = new APResources(pdfDocument.getCosDocument().getOptions().getFontSet(), | |
pdfDocument.getCosDocument().getOptions().getDocLocale(), | |
new HashMap<Font, PDFFont>()); | |
APContext apContext = new APContext(apResources, true, null); | |
apContext.setAnnotationsToBeProcessed(EnumSet.of(PDFAnnotationEnum.Highlight)); | |
AppearanceService.generateAppearances(pdfDocument, apContext, null); | |
// Now extract the pages | |
PDFPage[] pages = new PDFPage[pagesToExtract.size()]; | |
pages = pagesToExtract.toArray(pages); | |
PMMService pmmService = new PMMService(pdfDocument); | |
PDFDocument extractedPages = pmmService.extractPages(pages,PMMOptions.newInstance(PMMOptions.AnnotationsForms), PDFOpenOptions.newInstance()); | |
// Save and close | |
DocumentHelper.saveFullAndClose(extractedPages, outputDir+"JavaScriptPages.pdf"); | |
System.out.println("Done!"); | |
} | |
public static double[] wordQuadsToAnnotQuads(Word word) throws Exception { | |
double[] quadPoints = new double[8]; | |
quadPoints[0] = word.getBoundingQuads().get(0).p4().x(); | |
quadPoints[1] = word.getBoundingQuads().get(0).p4().y(); | |
quadPoints[2] = word.getBoundingQuads().get(0).p3().x(); | |
quadPoints[3] = word.getBoundingQuads().get(0).p3().y(); | |
quadPoints[4] = word.getBoundingQuads().get(0).p1().x(); | |
quadPoints[5] = word.getBoundingQuads().get(0).p1().y(); | |
quadPoints[6] = word.getBoundingQuads().get(0).p2().x(); | |
quadPoints[7] = word.getBoundingQuads().get(0).p2().y(); | |
return quadPoints; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment