Created
October 23, 2016 22:26
-
-
Save JoelGeraci-Datalogics/c0b57d4f9a8c2a219f88f2f9227271e8 to your computer and use it in GitHub Desktop.
Search and Redact a PDF Using RegEx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Copyright Datalogics, Inc. 2015
 */
package pdfjt.cookbook.document; | |
import com.adobe.fontengine.font.Font;
import com.adobe.internal.io.ByteReader;
import com.adobe.internal.io.InputStreamByteReader;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.graphics.font.PDFFont;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationEnum;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationRedaction;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.ap.AppearanceService;
import com.adobe.pdfjt.services.ap.spi.APContext;
import com.adobe.pdfjt.services.ap.spi.APResources;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.redaction.RedactionOptions;
import com.adobe.pdfjt.services.redaction.RedactionService;
import com.adobe.pdfjt.services.textextraction.Word;
import com.adobe.pdfjt.services.textextraction.WordsIterator;
import com.datalogics.pdf.document.FontSetLoader;
import com.datalogics.pdf.samples.util.IoUtils;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.regex.Pattern;
/** | |
* Searches for phone numbers and redacts them. | |
*/ | |
public class SearchAndRedactUsingRegEx { | |
private static final String inputPDFURL = "http://dev.datalogics.com/cookbook/document/SearchAndRedactUsingRegEx_Input.pdf"; | |
static public void main(String[] args) throws Exception { | |
// First read in the PDF file | |
URLConnection connection = new URL(inputPDFURL).openConnection(); | |
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); | |
connection.connect(); | |
InputStream fis = connection.getInputStream(); | |
ByteReader byteReader = new InputStreamByteReader(fis); | |
PDFDocument pdfDocument = PDFDocument.newInstance(byteReader, PDFOpenOptions.newInstance()); | |
// Then get the first (and only) page in the file. We'll need this object in order to add annotations to it. | |
PDFPage pdfPageOne = pdfDocument.requirePages().getPage(0); | |
ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument, FontSetLoader.newInstance().getFontSet()); | |
WordsIterator wordsIterator = textExtractor.getWordsIterator(); | |
System.out.println("Phone Numbers Found:"); | |
while (wordsIterator.hasNext()) { | |
Word word = wordsIterator.next(); | |
// Look for phone number pattern ###-###-#### | |
if (word.toString().matches("\\d{3}[-\\.\\s]\\d{3}[-\\.\\s]\\d{4}")) { | |
System.out.println(word.toString()); | |
/* | |
* Create a new Redaction annotation and use the location | |
* properties of the word to set the properties of the | |
* annotation. | |
* | |
*/ | |
PDFAnnotationRedaction pdfAnnotationRedaction = PDFAnnotationRedaction.newInstance(pdfDocument); | |
pdfAnnotationRedaction.setQuadPoints(wordQuadsToAnnotQuads(word)); | |
pdfAnnotationRedaction.setRect(pdfAnnotationRedaction.getRedactionAreaBBox()); | |
pdfAnnotationRedaction.setColor(new double[] { 1, 0, 0 }); // red | |
pdfAnnotationRedaction.setInteriorColor(new double[] { 0, 0, 0 }); // black | |
pdfPageOne.addAnnotation(pdfAnnotationRedaction); | |
} | |
} | |
// Now create the appearances of the Redaction annotations | |
APResources apResources = new APResources(pdfDocument.getCosDocument().getOptions().getFontSet(), | |
pdfDocument.getCosDocument().getOptions().getDocLocale(), | |
new HashMap<Font, PDFFont>()); | |
APContext apContext = new APContext(apResources, true, null); | |
apContext.setAnnotationsToBeProcessed(EnumSet.of(PDFAnnotationEnum.Redact)); | |
AppearanceService.generateAppearances(pdfDocument, apContext, null); | |
// Apply the redactions | |
RedactionService.applyRedaction(pdfDocument, | |
new RedactionOptions(null), | |
IoUtils.newByteWriter(IoUtils.createUrlFromPath("SearchAndRedactUsingRegEx_Output.pdf"))); | |
System.out.println("Done!"); | |
} | |
public static double[] wordQuadsToAnnotQuads(Word word) throws Exception { | |
double[] quadPoints = new double[8]; | |
quadPoints[0] = word.getBoundingQuads().get(0).p1().x(); | |
quadPoints[1] = word.getBoundingQuads().get(0).p1().y(); | |
quadPoints[2] = word.getBoundingQuads().get(0).p2().x(); | |
quadPoints[3] = word.getBoundingQuads().get(0).p2().y(); | |
quadPoints[4] = word.getBoundingQuads().get(0).p3().x(); | |
quadPoints[5] = word.getBoundingQuads().get(0).p3().y(); | |
quadPoints[6] = word.getBoundingQuads().get(0).p4().x(); | |
quadPoints[7] = word.getBoundingQuads().get(0).p4().y(); | |
return quadPoints; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment