Created
May 21, 2025 12:59
-
-
Save documentprocessing/0a79744ff24115ab33b9c8bb7cc80acc to your computer and use it in GitHub Desktop.
Text Extraction from PDF using PDFBox in Java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
public class PDFTextExtractor { | |
public static void main(String[] args) { | |
// Path to your PDF file | |
String filePath = "sample.pdf"; | |
try (PDDocument document = PDDocument.load(new File(filePath))) { | |
// 1. Create PDFTextStripper instance | |
PDFTextStripper textStripper = new PDFTextStripper(); | |
// 2. Configure extraction settings (optional) | |
textStripper.setSortByPosition(true); // Maintains physical layout | |
textStripper.setStartPage(1); // First page to extract | |
textStripper.setEndPage(document.getNumberOfPages()); // Last page | |
// 3. Extract text from entire document | |
String text = textStripper.getText(document); | |
// 4. Output results | |
System.out.println("=== Extracted Text ==="); | |
System.out.println(text); | |
// 5. (Optional) Save to text file | |
// Files.writeString(Path.of("extracted.txt"), text); | |
} catch (IOException e) { | |
System.err.println("Error extracting text: " + e.getMessage()); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment