documentprocessing · June 10, 2025 06:44
diff --git a/extract-text-from-pdf-using-pdfclown.java b/extract-text-from-pdf-using-pdfclown.java
 // Import PDF Clown text extraction classes
 import org.pdfclown.documents.Document;
 import org.pdfclown.documents.Page;
 import org.pdfclown.files.File;
 import org.pdfclown.tools.TextExtractor;

 /**
  * Command-line example that prints the text of every page of a PDF using PDF Clown.
  *
  * <p>For each text string found on a page, the text is printed together with its font,
  * its font size and the x/y position of its bounding box.
  */
 public class ExtractTextFromPDF {
   /** PDF path used when no command-line argument is supplied. */
   private static final String DEFAULT_INPUT_PATH = "input.pdf";

   /**
    * Loads a PDF, walks its pages in order and prints the extracted text to standard output.
    *
    * @param args optional; {@code args[0]} is the path of the PDF to read. When absent,
    *     {@code "input.pdf"} in the working directory is used.
    */
   public static void main(String[] args) {
     String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

     // try-with-resources releases the underlying file handle even when extraction fails;
     // previously the file was opened and never closed.
     try (File pdfFile = new File(inputPath)) {
       Document document = pdfFile.getDocument();
       TextExtractor textExtractor = new TextExtractor();

       System.out.println("=== Extracted Text ===");
       for (Page page : document.getPages()) {
         // getIndex() is used as zero-based here; +1 gives the human-readable page number.
         System.out.println("\n--- Page " + (page.getIndex() + 1) + " ---");
         printPageText(textExtractor, page);
       }

       System.out.println("\nText extraction completed!");
     } catch (Exception e) {
       // Top-level boundary of a sample program: report which input failed, then the cause.
       System.err.println("Failed to extract text from " + inputPath);
       e.printStackTrace();
     }
   }

   /**
    * Prints one line per text string of {@code page}: text, font, size and box position.
    *
    * <p>NOTE(review): {@code TextExtractor.TextInfo} and {@code getTextStrings()} do not appear
    * in the published PDF Clown API as far as I can tell (there {@code extract(page)} yields a
    * map of areas to text strings) - confirm against the library version actually in use.
    */
   private static void printPageText(TextExtractor textExtractor, Page page) {
     TextExtractor.TextInfo textInfo = textExtractor.extract(page);
     for (TextExtractor.TextString textString : textInfo.getTextStrings()) {
       System.out.println(
         "Text: \"" + textString.getText() + "\"" +
         " | Font: " + textString.getFont() +
         " | Size: " + textString.getFontSize() +
         " | Position: (" + textString.getBox().getX() + ", " + textString.getBox().getY() + ")"
       );
     }
   }
 }
	// Import PDF Clown text extraction classes
	import org.pdfclown.documents.Document;
	import org.pdfclown.documents.Page;
	import org.pdfclown.files.File;
	import org.pdfclown.tools.TextExtractor;

	public class ExtractTextFromPDF {
	public static void main(String[] args) {
	try {
	// 1. Load an existing PDF file
	File pdfFile = new File("input.pdf");
	Document document = pdfFile.getDocument();

	// 2. Initialize the text extractor
	TextExtractor textExtractor = new TextExtractor();

	// 3. Iterate through pages and extract text
	System.out.println("=== Extracted Text ===");
	for (Page page : document.getPages()) {
	System.out.println("\n--- Page " + (page.getIndex() + 1) + " ---");

	// Extract text with formatting metadata
	TextExtractor.TextInfo textInfo = textExtractor.extract(page);

	// Print text content (with position/font details)
	for (TextExtractor.TextString textString : textInfo.getTextStrings()) {
	System.out.println(
	"Text: \"" + textString.getText() + "\"" +
	" \| Font: " + textString.getFont() +
	" \| Size: " + textString.getFontSize() +
	" \| Position: (" + textString.getBox().getX() + ", " + textString.getBox().getY() + ")"
	);
	}
	}

	System.out.println("\nText extraction completed!");
	} catch (Exception e) {
	e.printStackTrace();
	}
	}
	}