/*
 * Copyright Datalogics, Inc. 2015
 */

package pdfjt.cookbook.document;

import com.adobe.fontengine.font.Font;
import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.graphics.font.PDFFont;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationEnum;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationRedaction;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.ap.AppearanceService;
import com.adobe.pdfjt.services.ap.spi.APContext;
import com.adobe.pdfjt.services.ap.spi.APResources;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.redaction.RedactionOptions;
import com.adobe.pdfjt.services.redaction.RedactionService;
import com.adobe.pdfjt.services.textextraction.Word;
import com.adobe.pdfjt.services.textextraction.WordsIterator;

import com.datalogics.pdf.document.FontSetLoader;
import com.datalogics.pdf.samples.util.IoUtils;

import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.regex.Pattern;

/**
 * Searches for phone numbers and redacts them.
 *
 * <p>
 * The sample downloads the input PDF, walks its words in reading order, and compares each word against
 * a phone-number regular expression. Every matching word gets a Redaction annotation built from the
 * word's bounding quads. Once all annotations are in place their appearances are generated and the
 * redactions are applied, writing the result to {@code SearchAndRedactUsingRegEx_Output.pdf}.
 * </p>
 */
public class SearchAndRedactUsingRegEx {

    private static final String INPUT_PDF_URL =
        "http://dev.datalogics.com/cookbook/document/SearchAndRedactUsingRegEx_Input.pdf";

    /**
     * Three digits, three digits, four digits, with each group separated by a single hyphen, period or
     * whitespace character (for example {@code 312-555-0123} or {@code 312.555.0123}). Compiled once
     * because it is tested against every word in the document.
     */
    private static final Pattern PHONE_NUMBER = Pattern.compile("\\d{3}[-\\.\\s]\\d{3}[-\\.\\s]\\d{4}");

    /** Number of coordinates (four x/y corner pairs) that describe one quad. */
    private static final int COORDS_PER_QUAD = 8;

    /**
     * Downloads the input PDF, marks every word that looks like a phone number for redaction, and
     * writes the redacted document to {@code SearchAndRedactUsingRegEx_Output.pdf}.
     *
     * @param args command-line arguments; not used
     * @throws Exception if the download, the text extraction, or the redaction fails
     */
    public static void main(String[] args) throws Exception {
        // First read in the PDF file
        URLConnection connection = new URL(INPUT_PDF_URL).openConnection();
        connection.setRequestProperty("User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        connection.connect();

        // The document is constructed on top of this stream, so the stream stays open until all
        // processing is finished and is closed on every exit path.
        try (InputStream inputStream = connection.getInputStream()) {
            ByteReader byteReader = new InputStreamByteReader(inputStream);
            PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance());
            try {
                // Then get the first (and only) page in the file. We'll need this object in order to
                // add annotations to it. Note that every annotation created below is added to this
                // page, which is only right for a single-page input such as the cookbook sample.
                PDFPage pdfPageOne = pdfDocument.requirePages().getPage(0);

                ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument,
                    FontSetLoader.newInstance().getFontSet());
                WordsIterator wordsIterator = textExtractor.getWordsIterator();

                System.out.println("Phone Numbers Found:");
                while (wordsIterator.hasNext()) {
                    Word word = wordsIterator.next();
                    String wordText = word.toString();

                    // Only words that match the phone number pattern in full are redacted
                    if (!PHONE_NUMBER.matcher(wordText).matches()) {
                        continue;
                    }
                    System.out.println(wordText);

                    /*
                     * Create a new Redaction annotation and use the location
                     * properties of the word to set the properties of the
                     * annotation.
                     */
                    PDFAnnotationRedaction pdfAnnotationRedaction = PDFAnnotationRedaction.newInstance(pdfDocument);
                    pdfAnnotationRedaction.setQuadPoints(wordQuadsToAnnotQuads(word));
                    // The annotation rectangle is derived from the quad points that were just set
                    pdfAnnotationRedaction.setRect(pdfAnnotationRedaction.getRedactionAreaBBox());
                    pdfAnnotationRedaction.setColor(new double[] { 1, 0, 0 }); // red
                    pdfAnnotationRedaction.setInteriorColor(new double[] { 0, 0, 0 }); // black
                    pdfPageOne.addAnnotation(pdfAnnotationRedaction);
                }

                // Now create the appearances of the Redaction annotations
                APResources apResources = new APResources(pdfDocument.getCosDocument().getOptions().getFontSet(),
                    pdfDocument.getCosDocument().getOptions().getDocLocale(),
                    new HashMap<Font, PDFFont>());
                APContext apContext = new APContext(apResources, true, null);
                // Restrict appearance generation to the Redact annotations
                apContext.setAnnotationsToBeProcessed(EnumSet.of(PDFAnnotationEnum.Redact));
                AppearanceService.generateAppearances(pdfDocument, apContext, null);

                // Apply the redactions
                RedactionService.applyRedaction(pdfDocument, new RedactionOptions(null),
                    IoUtils.newByteWriter(IoUtils.createUrlFromPath("SearchAndRedactUsingRegEx_Output.pdf")));

                System.out.println("Done!");
            } finally {
                // Close the document whether or not the redaction succeeded
                pdfDocument.close();
            }
        }
    }

    /**
     * Flattens the bounding quads of a word into the coordinate array expected by
     * {@code PDFAnnotationRedaction.setQuadPoints}.
     *
     * <p>
     * Each quad contributes eight consecutive values in the order
     * {@code p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y}. All of the word's quads are included, so
     * a word reported with more than one quad is covered in full rather than only its first quad.
     * </p>
     *
     * @param word the word whose bounding quads are converted
     * @return an array of {@code 8 * n} coordinates, where {@code n} is the number of bounding quads
     * @throws IllegalArgumentException if the word has no bounding quads
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static double[] wordQuadsToAnnotQuads(Word word) throws Exception {
        final int quadCount = word.getBoundingQuads().size();
        if (quadCount == 0) {
            throw new IllegalArgumentException("Word has no bounding quads: " + word);
        }

        final double[] quadPoints = new double[COORDS_PER_QUAD * quadCount];
        for (int i = 0; i < quadCount; i++) {
            final int base = COORDS_PER_QUAD * i;
            quadPoints[base] = word.getBoundingQuads().get(i).p1().x();
            quadPoints[base + 1] = word.getBoundingQuads().get(i).p1().y();
            quadPoints[base + 2] = word.getBoundingQuads().get(i).p2().x();
            quadPoints[base + 3] = word.getBoundingQuads().get(i).p2().y();
            quadPoints[base + 4] = word.getBoundingQuads().get(i).p3().x();
            quadPoints[base + 5] = word.getBoundingQuads().get(i).p3().y();
            quadPoints[base + 6] = word.getBoundingQuads().get(i).p4().x();
            quadPoints[base + 7] = word.getBoundingQuads().get(i).p4().y();
        }
        return quadPoints;
    }
}