Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save documentprocessing/b618d397689c968fdd242383d11248ae to your computer and use it in GitHub Desktop.
Save documentprocessing/b618d397689c968fdd242383d11248ae to your computer and use it in GitHub Desktop.
Extract Text from PDF with PDF Clown Java API
// Import PDF Clown text extraction classes
import org.pdfclown.documents.Document;
import org.pdfclown.documents.Page;
import org.pdfclown.files.File;
import org.pdfclown.tools.TextExtractor;
/**
 * Command-line sample that prints the text found on every page of a PDF
 * using the PDF Clown {@code TextExtractor}.
 *
 * <p>Usage: {@code java ExtractTextFromPDF [path-to-pdf]}. When no argument
 * is supplied, {@code input.pdf} in the working directory is read.
 */
public class ExtractTextFromPDF {

    /** PDF that is read when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "input.pdf";

    /**
     * Opens the PDF, walks its pages in document order and prints one line per
     * extracted text string (text, font, font size and box origin) to standard
     * output.
     *
     * <p>Any failure while opening, parsing or closing the file is reported as
     * a stack trace on standard error; the method itself does not throw.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to read
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // 1. Load an existing PDF file.
        // try-with-resources releases the underlying file handle on success and
        // on failure; the previous version never closed the file.
        try (File pdfFile = new File(inputPath)) {
            Document document = pdfFile.getDocument();

            // 2. Initialize the text extractor (one instance is reused for all pages).
            TextExtractor textExtractor = new TextExtractor();

            // 3. Iterate through pages and extract text.
            System.out.println("=== Extracted Text ===");
            for (Page page : document.getPages()) {
                // getIndex() is shifted by one so page numbers are printed starting at 1.
                System.out.println("\n--- Page " + (page.getIndex() + 1) + " ---");

                // NOTE(review): TextExtractor.TextInfo / TextExtractor.TextString and the
                // getTextStrings()/getFont()/getFontSize() accessors are kept exactly as
                // in the original sample; confirm they exist in the PDF Clown version on
                // the classpath before relying on this code.
                TextExtractor.TextInfo textInfo = textExtractor.extract(page);

                // Print text content (with position/font details).
                for (TextExtractor.TextString textString : textInfo.getTextStrings()) {
                    System.out.println(
                        "Text: \"" + textString.getText() + "\""
                            + " | Font: " + textString.getFont()
                            + " | Size: " + textString.getFontSize()
                            + " | Position: (" + textString.getBox().getX()
                            + ", " + textString.getBox().getY() + ")"
                    );
                }
            }
            System.out.println("\nText extraction completed!");
        } catch (Exception e) {
            // Sample-level boundary: report the problem instead of propagating it.
            e.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment