Skip to content

Instantly share code, notes, and snippets.

@lebedov
Created April 28, 2021 12:29
Show Gist options
  • Save lebedov/3518142a5c2431b8c9a28d323100558a to your computer and use it in GitHub Desktop.
Save lebedov/3518142a5c2431b8c9a28d323100558a to your computer and use it in GitHub Desktop.
How to use pdfbox's PDFTextStripper class in Python.
#!/usr/bin/env python3
"""
How to use pdfbox's PDFTextStripper class in Python.
"""
import pathlib
import pkg_resources
import re
import urllib.request
import appdirs
import jpype
import jpype.imports
import numpy as np
# Locate the pdfbox-app jar files in the 'python-pdfbox' user cache directory.
# Replace this lookup with the path to your pdfbox jar file if it lives
# elsewhere:
a = appdirs.AppDirs('python-pdfbox')
cache_dir = pathlib.Path(a.user_cache_dir)
# Every cached file matching the pdfbox-app jar naming scheme (may be empty).
file_list = list(cache_dir.glob('pdfbox-app-*.jar'))
def f(s):
    """
    Sort key: version embedded in a pdfbox-app jar file name.

    Parameters
    ----------
    s : pathlib.Path
        Path whose ``name`` has the form ``pdfbox-app-<version>.jar``.

    Returns
    -------
    version
        Result of ``pkg_resources.parse_version`` applied to ``<version>``.

    Raises
    ------
    ValueError
        If the file name does not contain a recognizable version.
    """
    # Raw string: '\w', '\.' and '\-' are invalid escape sequences in a
    # regular string literal.
    match = re.search(r'pdfbox-app-([\w\.\-]+)\.jar', s.name)
    if match is None:
        raise ValueError(
            'cannot parse pdfbox version from {!r}'.format(s.name))
    return pkg_resources.parse_version(match.group(1))
# Fail with an explicit message instead of an IndexError when the cache
# directory holds no pdfbox-app jar.
if not file_list:
    raise FileNotFoundError(
        'no pdfbox-app-*.jar found in {}'.format(cache_dir))
# Put the jar with the highest parsed version on the classpath; this is done
# ahead of the JVM startup below.
jpype.addClassPath(sorted(file_list, key=f)[-1])
if not jpype.isJVMStarted():
    # Run AWT headless. With convertStrings=False, extract_text converts the
    # returned text with an explicit str().
    jpype.startJVM(jpype.getDefaultJVMPath(), '-Djava.awt.headless=true', convertStrings=False)
from java.awt.image import BufferedImage
from java.io import File
from org.apache.pdfbox.pdmodel import PDDocument
from org.apache.pdfbox.text import PDFTextStripper
def extract_text(in_file):
    """
    Extract text of PDF file.

    Parameters
    ----------
    in_file : str or pathlib.Path
        Path to input PDF file.

    Returns
    -------
    text : str
        Extracted text.
    """
    # NOTE(review): PDFBox 3.0 reportedly moved loading to
    # org.apache.pdfbox.Loader.loadPDF; PDDocument.load is the 2.x API --
    # confirm against the jar version in use.
    doc = PDDocument.load(File(str(in_file)))
    try:
        pdf_text_stripper = PDFTextStripper()
        # Convert to a Python str while the document is still open.
        return str(pdf_text_stripper.getText(doc))
    finally:
        # Always release the document, even if text extraction fails.
        doc.close()
if __name__ == '__main__':
    import os
    import tempfile
    import urllib.request

    # Download sample multipage PDF:
    url = 'https://researchtorevenue.files.wordpress.com/2015/04/1r41ai10801601_fong.pdf'
    with urllib.request.urlopen(url) as response:
        data = response.read()

    # mkstemp returns an already-open descriptor; wrap it instead of opening
    # the path a second time so the descriptor is closed rather than leaked.
    fd, name = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(data)
        result = extract_text(name)
    finally:
        # Remove the temporary file even if extraction fails.
        os.unlink(name)
@sfinotti
Copy link

@lebedov Thank you again. You nailed it!!! The file was corrupted; I downloaded it again and it's working now. Thank you!!!

@mara004
Copy link

mara004 commented Jun 22, 2023

Nice example, almost looks easier than using the CLI!

@mrtj
Copy link

mrtj commented Feb 11, 2025

For PDFBox 3.0 they've split the loading API from PDDocument, so use:

from org.apache.pdfbox import Loader

...
doc = Loader.loadPDF(File(in_file))
...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment