-
-
Save lebedov/3518142a5c2431b8c9a28d323100558a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
""" | |
How to use pdfbox's PDFTextStripper class in Python. | |
""" | |
import pathlib | |
import pkg_resources | |
import re | |
import urllib.request | |
import appdirs | |
import jpype | |
import jpype.imports | |
import numpy as np | |
# Collect every pdfbox application jar found in this tool's per-user cache
# directory (point `file_list` at an explicit jar path to use one stored
# somewhere else):
a = appdirs.AppDirs('python-pdfbox')
cache_dir = pathlib.Path(a.user_cache_dir)
file_list = [*cache_dir.glob('pdfbox-app-*.jar')]
def f(s):
    """
    Sort key that orders pdfbox jar files by their release version.

    Parameters
    ----------
    s : pathlib.Path
        Path whose file name has the form ``pdfbox-app-<version>.jar``.

    Returns
    -------
    version
        Result of `pkg_resources.parse_version` applied to the version part
        of the file name; used to compare jar files with each other.

    Raises
    ------
    ValueError
        If the file name contains no recognizable version string.
    """
    # Raw string: '\w', '\.' and '\-' are not valid escapes in a plain string
    # literal and trigger a warning on recent Python versions.
    match = re.search(r'pdfbox-app-([\w\.\-]+)\.jar', s.name)
    if match is None:
        # Names such as 'pdfbox-app-.jar' or 'pdfbox-app-2.0.26 (1).jar' pass
        # the glob pattern but carry no parsable version.
        raise ValueError(
            'cannot determine pdfbox version from file name %r' % s.name)
    return pkg_resources.parse_version(match.group(1))
# Fail with an actionable message instead of a bare IndexError when the cache
# directory holds no pdfbox jar.
if not file_list:
    raise FileNotFoundError(
        f'no pdfbox-app-*.jar file found in {cache_dir}; copy the pdfbox app '
        'jar into that directory or pass its path to jpype.addClassPath()')

# Put the jar with the highest version on the class path; this is done before
# the JVM is started below.
jpype.addClassPath(sorted(file_list, key=f)[-1])
if not jpype.isJVMStarted():
    # Headless mode: no display is required. convertStrings=False means Java
    # strings are not converted to Python str automatically (see JPype docs).
    jpype.startJVM(jpype.getDefaultJVMPath(), '-Djava.awt.headless=true', convertStrings=False)
from java.awt.image import BufferedImage | |
from java.io import File | |
from org.apache.pdfbox.pdmodel import PDDocument | |
from org.apache.pdfbox.text import PDFTextStripper | |
def extract_text(in_file):
    """
    Extract text of PDF file.

    Parameters
    ----------
    in_file : str or pathlib.Path
        Path to input PDF file.

    Returns
    -------
    text : str
        Extracted text.
    """

    # str() leaves plain strings untouched and lets pathlib.Path arguments be
    # handed to the java.io.File constructor.
    doc = PDDocument.load(File(str(in_file)))
    try:
        pdf_text_stripper = PDFTextStripper()
        text = pdf_text_stripper.getText(doc)
    finally:
        # Always release the loaded document, even if extraction raises.
        doc.close()

    # The JVM is started with convertStrings=False, so convert the result to a
    # Python str explicitly.
    return str(text)
if __name__ == '__main__':
    import os
    import tempfile
    import urllib

    # Download sample multipage PDF:
    with urllib.request.urlopen('https://researchtorevenue.files.wordpress.com/2015/04/1r41ai10801601_fong.pdf') as response:
        data = response.read()

    fd, name = tempfile.mkstemp()
    try:
        # Wrap the descriptor returned by mkstemp so that it is closed when
        # the `with` block ends; opening the file by name a second time would
        # leave `fd` open.
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(data)
        result = extract_text(name)
    finally:
        # Remove the temporary file even if writing or extraction fails.
        os.unlink(name)
@sfinotti The script can't find the pdfbox jar file - you need to either copy the jar file to the indicated user cache directory (e.g., `~/Library/Caches/python-pdfbox` on macOS or `~/.cache/python-pdfbox` on Linux) or update the first few lines to pass the appropriate path to `jpype.addClassPath()`.
@lebedov Thank you for the prompt answer.
The jar file was already there:
finotti@H170:~/.cache/python-pdfbox$ ls -l
total 32
-rwxrwxr-x 1 finotti finotti 28694 jun 27 10:26 pdfbox-app-2.0.26.jar
finotti@H170:~/.cache/python-pdfbox$
Maybe the jar version ??
@sfinotti The file size looks suspiciously small - check that its SHA256 checksum matches the one listed on Apache's website using `sha256sum`. If it doesn't match, try downloading the jar file again (the script works fine with 2.0.26 on my system).
@lebedov Thank you again. You nailed it! The file was corrupted; I downloaded it again and it's working now. Thank you!
Nice example, almost looks easier than using the CLI!
Hi, I got the following error when trying to use the code:
`Exception: Java Exception
The above exception was the direct cause of the following exception:
java.lang.ClassNotFoundException Traceback (most recent call last)
File ~/anaconda3/envs/py39/lib/python3.9/site-packages/jpype/imports.py:195, in _JImportLoader.find_spec(self, name, path, target)
193 try:
194 # Use forname because it give better diagnostics
--> 195 cls = _jpype._java_lang_Class.forName(name, True, _jpype.JPypeClassLoader)
197 # This code only is hit if an error was not thrown
java.lang.ClassNotFoundException: java.lang.ClassNotFoundException: org.apache
The above exception was the direct cause of the following exception:
ImportError Traceback (most recent call last)
Input In [39], in <cell line: 14>()
12 from java.awt.image import BufferedImage
13 from java.io import File
---> 14 from org.apache.pdfbox.pdmodel import PDDocument
15 from org.apache.pdfbox.text import PDFTextStripper
File ~/anaconda3/envs/py39/lib/python3.9/site-packages/jpype/imports.py:203, in _JImportLoader.find_spec(self, name, path, target)
201 # Not found is acceptable
202 except Exception as ex:
--> 203 raise ImportError("Failed to import '%s'" % name) from ex
205 # Import the java module
206 return _ModuleSpec(name, self)
ImportError: Failed to import 'org.apache'`