Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save JoelGeraci-Datalogics/2dec26205488610e88facde495b01e62 to your computer and use it in GitHub Desktop.
Save JoelGeraci-Datalogics/2dec26205488610e88facde495b01e62 to your computer and use it in GitHub Desktop.
Searches for a word, highlights it, then extracts all pages that contain that word
/*
* Copyright Datalogics, Inc. 2015
*/
package pdfjt.cookbook.document;
import com.adobe.fontengine.font.Font;
import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.graphics.font.PDFFont;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationEnum;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationHighlight;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.pdf.page.PDFPageTree;
import com.adobe.pdfjt.services.ap.AppearanceService;
import com.adobe.pdfjt.services.ap.spi.APContext;
import com.adobe.pdfjt.services.ap.spi.APResources;
import com.adobe.pdfjt.services.manipulations.PMMOptions;
import com.adobe.pdfjt.services.manipulations.PMMService;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.textextraction.Word;
import com.adobe.pdfjt.services.textextraction.WordsIterator;
import com.datalogics.pdf.document.DocumentHelper;
import com.datalogics.pdf.document.FontSetLoader;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
/**
* Searches for a word, highlights it, then extracts all pages that contain that word.
*/
public class ExtractPagesMatchingSearchTerms {

    /** Location of the PDF that is downloaded and searched. */
    private static final String INPUT_PDF_URL = "http://dev.datalogics.com/cookbook/document/PDF32000_2008.pdf";

    /** Directory prefix prepended to the output file name. */
    private static final String OUTPUT_DIR = "cookbook/Document/output/";

    /** Lower-case text a word must contain for its page to be highlighted and extracted. */
    private static final String SEARCH_TERM = "javascript";

    /** User-Agent header sent with the download request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11";

    /**
     * Downloads the input PDF, highlights every word containing the search
     * term, and saves the pages that contain at least one match as a new PDF.
     *
     * @param args command line arguments (not used)
     * @throws Exception if the download, the PDF processing, or the save fails
     */
    public static void main(String[] args) throws Exception {
        // First read in the PDF file
        URLConnection connection = new URL(INPUT_PDF_URL).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.connect();

        // try-with-resources guarantees the download stream is closed even
        // when PDF processing fails part-way through.
        try (InputStream pdfStream = connection.getInputStream()) {
            ByteReader byteReader = new InputStreamByteReader(pdfStream);
            PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
            try {
                System.out.println("Pages Extracted:");
                List<PDFPage> pagesToExtract = highlightMatchingWords(pdfDocument);

                // Nothing matched: do not hand an empty page array to the
                // extraction service or write an output file.
                if (pagesToExtract.isEmpty()) {
                    System.out.println("No pages contain \"" + SEARCH_TERM + "\"; nothing to extract.");
                    return;
                }

                generateHighlightAppearances(pdfDocument);
                extractAndSave(pdfDocument, pagesToExtract);
                System.out.println("Done!");
            } finally {
                // The extracted document is closed by saveFullAndClose; the
                // source document has to be released separately.
                pdfDocument.close();
            }
        }
    }

    /**
     * Adds a Highlight annotation over every word that contains
     * {@link #SEARCH_TERM} (compared case-insensitively) and collects the
     * pages on which those words occur. Each collected page number is printed
     * to standard output once.
     *
     * @param pdfDocument the document to search and annotate
     * @return the pages with at least one match, each listed once, in the
     *         order in which their first match was encountered
     * @throws Exception if text extraction or annotation creation fails
     */
    private static List<PDFPage> highlightMatchingWords(PDFDocument pdfDocument) throws Exception {
        PDFPageTree pdfPageTree = pdfDocument.requirePages();
        ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument,
                FontSetLoader.newInstance().getFontSet());
        WordsIterator wordsIterator = textExtractor.getWordsIterator();

        // This List will hold the pages that need to be extracted
        List<PDFPage> pagesToExtract = new ArrayList<>();
        // Page numbers already collected; de-duplicating on the number avoids
        // a linear scan of the list per match and does not rely on how
        // PDFPage implements equals().
        Set<Integer> collectedPageNumbers = new HashSet<>();

        while (wordsIterator.hasNext()) {
            Word word = wordsIterator.next();
            // Locale.ROOT keeps the comparison independent of the JVM default
            // locale (a Turkish default would lower-case 'I' to a dotless i).
            if (!word.toString().toLowerCase(Locale.ROOT).contains(SEARCH_TERM)) {
                continue;
            }

            // The page tree is addressed with (page number - 1), i.e. the
            // word's page number is treated as 1-based.
            int pageNumber = word.getPageNumber();
            PDFPage pdfPage = pdfPageTree.getPage(pageNumber - 1);

            /*
             * Create a new Highlight annotation and use the location
             * properties of the word to set the properties of the
             * annotation.
             */
            PDFAnnotationHighlight highlight = PDFAnnotationHighlight.newInstance(pdfDocument);
            highlight.setQuadPoints(wordQuadsToAnnotQuads(word));
            highlight.setColor(new double[] { 1, 0.819611, 0 }); // yellow to match Acrobat Highlights
            pdfPage.addAnnotation(highlight);

            // Add the page to the list of pages to extract if it's not already there.
            if (collectedPageNumbers.add(pageNumber)) {
                pagesToExtract.add(pdfPage);
                System.out.println(pageNumber);
            }
        }
        return pagesToExtract;
    }

    /**
     * Generates appearances for the Highlight annotations in the document.
     * Processing is restricted to annotations of type Highlight.
     *
     * @param pdfDocument the document whose Highlight annotations need appearances
     * @throws Exception if appearance generation fails
     */
    private static void generateHighlightAppearances(PDFDocument pdfDocument) throws Exception {
        APResources apResources = new APResources(pdfDocument.getCosDocument().getOptions().getFontSet(),
                pdfDocument.getCosDocument().getOptions().getDocLocale(),
                new HashMap<Font, PDFFont>());
        APContext apContext = new APContext(apResources, true, null);
        apContext.setAnnotationsToBeProcessed(EnumSet.of(PDFAnnotationEnum.Highlight));
        AppearanceService.generateAppearances(pdfDocument, apContext, null);
    }

    /**
     * Extracts the given pages, together with their annotations and forms,
     * into a new document and saves it as {@code JavaScriptPages.pdf} under
     * {@link #OUTPUT_DIR}. The new document is closed by the save helper.
     *
     * @param pdfDocument the source document the pages belong to
     * @param pagesToExtract the pages to copy into the new document
     * @throws Exception if extraction or saving fails
     */
    private static void extractAndSave(PDFDocument pdfDocument, List<PDFPage> pagesToExtract) throws Exception {
        PDFPage[] pages = pagesToExtract.toArray(new PDFPage[0]);
        PMMService pmmService = new PMMService(pdfDocument);
        PDFDocument extractedPages = pmmService.extractPages(pages,
                PMMOptions.newInstance(PMMOptions.AnnotationsForms), PDFOpenOptions.newInstance());
        // Save and close
        DocumentHelper.saveFullAndClose(extractedPages, OUTPUT_DIR + "JavaScriptPages.pdf");
    }

    /**
     * Converts the first bounding quad of a word into the eight-element
     * coordinate array passed to the Highlight annotation's quad points.
     * The corners are emitted in the order p4, p3, p1, p2, each as an
     * (x, y) pair.
     *
     * NOTE(review): only the first bounding quad is used, so a word that
     * reports several quads would be covered only partially; a word with no
     * quads makes the {@code get(0)} lookup fail. Confirm whether either case
     * occurs in practice.
     *
     * @param word the word whose first bounding quad is converted
     * @return an array {x4, y4, x3, y3, x1, y1, x2, y2} taken from the word's
     *         first bounding quad
     * @throws Exception if the word's bounding quads cannot be read
     */
    public static double[] wordQuadsToAnnotQuads(Word word) throws Exception {
        double[] quadPoints = new double[8];
        quadPoints[0] = word.getBoundingQuads().get(0).p4().x();
        quadPoints[1] = word.getBoundingQuads().get(0).p4().y();
        quadPoints[2] = word.getBoundingQuads().get(0).p3().x();
        quadPoints[3] = word.getBoundingQuads().get(0).p3().y();
        quadPoints[4] = word.getBoundingQuads().get(0).p1().x();
        quadPoints[5] = word.getBoundingQuads().get(0).p1().y();
        quadPoints[6] = word.getBoundingQuads().get(0).p2().x();
        quadPoints[7] = word.getBoundingQuads().get(0).p2().y();
        return quadPoints;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment