Last active
January 8, 2024 03:08
-
-
Save i000313/6372210 to your computer and use it in GitHub Desktop.
Java: Class to get PDF annotations using the itextpdf lib. #java, #pdf, #annotations, #pdf-annotations, #itextpdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.itextpdf; | |
import com.itextpdf.text.Rectangle; | |
import com.itextpdf.text.pdf.PdfArray; | |
import com.itextpdf.text.pdf.PdfDictionary; | |
import com.itextpdf.text.pdf.PdfName; | |
import com.itextpdf.text.pdf.PdfReader; | |
import com.itextpdf.text.pdf.PdfString; | |
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener; | |
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy; | |
import com.itextpdf.text.pdf.parser.PdfTextExtractor; | |
import com.itextpdf.text.pdf.parser.RegionTextRenderFilter; | |
import com.itextpdf.text.pdf.parser.RenderFilter; | |
import com.itextpdf.text.pdf.parser.TextExtractionStrategy; | |
import java.io.IOException; | |
/** | |
* Class for getting PDF annotations using the itextpdf lib (tested whith itextpdf-5.4.3). | |
* At the moment, this class allows to get highlighted text, underlined text, and | |
* text in comments. | |
* <br/> | |
* Document management — Portable document format — Part 1: PDF 1.7 | |
* http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf | |
* | |
* @author PSantos | |
*/ | |
public class PDFAnnotations { | |
public static void main(String args[]) { | |
try { | |
// Reads and parses a PDF document | |
PdfReader reader = new PdfReader("C:\\Users\\PSantos\\Desktop\\lixo\\exemplo.pdf"); | |
// For each PDF page | |
for (int i = 1; i <= reader.getNumberOfPages(); i++) { | |
// Get a page a PDF page | |
PdfDictionary page = reader.getPageN(i); | |
// Get all the annotations of page i | |
PdfArray annotsArray = page.getAsArray(PdfName.ANNOTS); | |
// If page does not have annotations | |
if (page.getAsArray(PdfName.ANNOTS) == null) { | |
continue; | |
} | |
// For each annotation | |
for (int j = 0; j < annotsArray.size(); ++j) { | |
// For current annotation | |
PdfDictionary curAnnot = annotsArray.getAsDict(j); | |
// Check the annotation subtype and print its text if not null | |
writeAnnotation(curAnnot, reader, i); | |
} | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
/** | |
* Check the annotation subtype and print its text. | |
* | |
* @param annot annotation to write. | |
* @param reader pdf document containing the annotation. | |
* @param pageNumber pdf page number containing the annotation. | |
* @throws IOException | |
*/ | |
public static void writeAnnotation(PdfDictionary annot, PdfReader reader, int pageNumber) throws IOException { | |
if(annot == null) { | |
return; | |
} | |
// System.out.print(annot.get(PdfName.SUBTYPE)); | |
// System.out.print(" -> Rect: " + annot.get(PdfName.RECT)); | |
PdfString text = null; | |
boolean mayHaveTextAnnotated = false; | |
// Highlights with comments (balloons) and Highliths | |
if (PdfName.HIGHLIGHT.equals(annot.get(PdfName.SUBTYPE))) { | |
// Only Highlights with comments may have text | |
text = (PdfString) annot.get(PdfName.CONTENTS); | |
mayHaveTextAnnotated = true; | |
} else if (PdfName.UNDERLINE.equals(annot.get(PdfName.SUBTYPE))) { | |
text = annot.getAsString(PdfName.CONTENTS); | |
mayHaveTextAnnotated = true; | |
// A comment (a balloon with a comment) | |
} else if (PdfName.TEXT.equals(annot.get(PdfName.SUBTYPE))) { | |
text = annot.getAsString(PdfName.CONTENTS); | |
} else { | |
text = annot.getAsString(PdfName.CONTENTS); | |
} | |
if(text != null) { | |
System.out.println(" -> " + text); | |
} | |
if(mayHaveTextAnnotated) { | |
PdfArray rectangle = (PdfArray) annot.get(PdfName.RECT); // ex: [82.1569, 757.575, 124.395, 769.305] | |
String textHighlighted = getTextFromRectangle(rectangle, reader, pageNumber); | |
if(textHighlighted != null) { | |
System.out.println(" Annotated text -> " + textHighlighted); | |
} | |
} | |
} | |
/** | |
* Extracts the text {@code rectangle}, located on page {@code pageNumber} | |
* of the pdf {@code reader}. | |
* <p>The text extracted by this method is not perfect. Usually extracts an | |
* unnecessary (not desirable) extra character. For instance, for an annotations | |
* like "[This is an annotated] text", the method will extract | |
* "This is an annotated t". </p> The extra character may appear before or | |
* after the annotated text. | |
* | |
* @param reader | |
* @param pageNumber | |
* @param rectangle ex: [82.1569, 757.575, 124.395, 769.305] | |
* @return the extracted text or null. | |
* @throws IOException | |
*/ | |
public static String getTextFromRectangle(PdfArray rectangle, | |
PdfReader reader, int pageNumber) throws IOException { | |
if(rectangle == null) { | |
return null; | |
} | |
// Get the retangle coodinates | |
float llx = rectangle.getAsNumber(0).floatValue(); | |
float lly = rectangle.getAsNumber(1).floatValue(); | |
float urx = rectangle.getAsNumber(2).floatValue(); | |
float ury = rectangle.getAsNumber(3).floatValue(); | |
Rectangle rect = new Rectangle(llx, lly, urx, ury); | |
RenderFilter filter = new RegionTextRenderFilter(rect); | |
TextExtractionStrategy strategy = | |
new FilteredTextRenderListener( | |
new LocationTextExtractionStrategy(), filter); | |
return PdfTextExtractor.getTextFromPage(reader, pageNumber, strategy); | |
} | |
} |
Glad to know. Thank you for let me know.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I have solved. I mistake signed to unsigned.
Thanks