Last active
September 1, 2024 19:50
-
-
Save hanjianwei/6838974 to your computer and use it in GitHub Desktop.
Extract title from pdf file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Extract title from PDF file. | |
Depends on: pyPDF, PDFMiner. | |
Usage: | |
find . -name "*.pdf" | xargs -I{} pdftitle -d tmp --rename {} | |
""" | |
import cStringIO | |
import getopt | |
import os | |
import re | |
import string | |
import sys | |
from pyPdf import PdfFileReader | |
from pyPdf.utils import PdfReadError | |
from pdfminer.converter import TextConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfinterp import PDFResourceManager, process_pdf, PDFTextExtractionNotAllowed | |
from pdfminer.pdfparser import PDFSyntaxError | |
__all__ = ['pdf_title'] | |
def sanitize(filename):
    """Reduce *filename* to characters that are safe in a file name.

    Only ASCII letters, digits, spaces and the characters ``-_.()`` are
    kept; every other character is dropped.
    """
    allowed = "-_.() " + string.ascii_letters + string.digits
    return ''.join(ch for ch in filename if ch in allowed)
def meta_title(filename):
    """Return the title stored in the PDF's document-information metadata.

    Returns an empty string when pyPdf raises ``PdfReadError``, when the
    file has no document-information dictionary, or when that dictionary
    has no (or an empty) title.
    """
    try:
        # A context manager guarantees the handle is closed even when
        # parsing fails; the bare file(...) call never closed it.
        with open(filename, 'rb') as pdf_file:
            docinfo = PdfFileReader(pdf_file).getDocumentInfo()
            # getDocumentInfo() yields None for PDFs without an /Info
            # entry, so guard before touching .title. The title is read
            # while the file is still open.
            title = docinfo.title if docinfo is not None else None
    except PdfReadError:
        return ""
    return title if title else ""
def copyright_line(line):
    """Tell whether *line* looks like a venue/status banner, not a title.

    The line is lower-cased and searched for one of the marker phrases.
    Returns the ``re`` match object (truthy) on a hit, otherwise ``None``.
    """
    marker = r'technical\s+report|proceedings|preprint|to\s+appear|submission'
    lowered = line.lower()
    return re.search(marker, lowered)
def empty_str(s):
    """Return True when *s* is empty or consists only of whitespace."""
    return not s.strip()
def pdf_text(filename):
    """Return the text PDFMiner extracts from the first page of *filename*.

    Returns an empty string when PDFMiner raises ``PDFSyntaxError`` or
    ``PDFTextExtractionNotAllowed``. Other errors (for example a missing
    file) propagate to the caller.
    """
    try:
        text = cStringIO.StringIO()
        rsrc = PDFResourceManager()
        device = TextConverter(rsrc, text, codec='utf-8', laparams=LAParams())
        try:
            # Close the input file deterministically instead of leaking
            # the handle returned by file(...).
            with open(filename, 'rb') as pdf_file:
                process_pdf(rsrc, device, pdf_file, None, maxpages=1, password='')
        finally:
            # Release the converter on both the success and failure paths.
            device.close()
        return text.getvalue()
    except (PDFSyntaxError, PDFTextExtractionNotAllowed):
        return ""
def title_start(lines):
    """Return the index of the first line that may begin the title.

    That is the first line which is neither blank nor matched by
    ``copyright_line``. Falls back to 0 when no line qualifies.
    """
    candidates = (
        idx for idx, text in enumerate(lines)
        if not (empty_str(text) or copyright_line(text))
    )
    return next(candidates, 0)
def title_end(lines, start, max_lines=2):
    """Return the exclusive end index of the title starting at *start*.

    Up to *max_lines* lines following ``lines[start]`` are inspected and
    the index of the first blank one is returned. When none of them is
    blank (or there are no following lines) the result is ``start + 1``,
    i.e. the title is taken to be a single line.
    """
    first = start + 1
    window = lines[first:first + max_lines]
    for offset, text in enumerate(window):
        if empty_str(text):
            return first + offset
    return first
def text_title(filename):
    """Guess the title from the text returned by ``pdf_text``.

    The text is split into lines, the title span is located with
    ``title_start`` / ``title_end``, and the stripped lines of that span
    are joined with single spaces.
    """
    raw = pdf_text(filename)
    lines = raw.strip().split('\n')
    begin = title_start(lines)
    end = title_end(lines, begin)
    pieces = [piece.strip() for piece in lines[begin:end]]
    return ' '.join(pieces)
def valid_title(title):
    """Return True when *title* is non-blank and has no file extension.

    The extension test uses ``os.path.splitext``: a title is rejected
    when the part after its last dot is non-blank.
    """
    if empty_str(title):
        return False
    extension = os.path.splitext(title)[1]
    return empty_str(extension)
def pdf_title(filename):
    """Return the best available title for the PDF at *filename*.

    The metadata title is tried first, then the title guessed from the
    page text; the first one accepted by ``valid_title`` wins. When both
    are rejected, the file's base name without extension is returned.
    """
    for extract in (meta_title, text_title):
        candidate = extract(filename)
        if valid_title(candidate):
            return candidate
    stem = os.path.splitext(filename)[0]
    return os.path.basename(stem)
if __name__ == "__main__":
    # Command-line driver: print each PDF's title, or with --rename move
    # the file to "<output dir>/<sanitized title>.pdf".
    usage = "Usage: %s [-d output] [--dry-run] [--rename] filenames" % sys.argv[0]

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'nd:', ['dry-run', 'rename'])
    except getopt.GetoptError as err:
        # Report a bad option with the usage text instead of a traceback.
        sys.stderr.write("%s\n" % err)
        print(usage)
        sys.exit(1)

    dry_run = False
    rename = False
    out_dir = "."  # not named "dir", which would shadow the builtin
    for opt, arg in opts:
        if opt in ('-n', '--dry-run'):
            dry_run = True
        elif opt == '--rename':
            rename = True
        elif opt == '-d':
            out_dir = arg

    if len(args) == 0:
        print(usage)
        sys.exit(1)

    for filename in args:
        title = pdf_title(filename)
        if not rename:
            print(title)
            continue

        # Collapse runs of whitespace before stripping unsafe characters.
        safe = sanitize(' '.join(title.split()))
        if not safe:
            # Every character was stripped (e.g. a non-ASCII title); keep
            # the original base name rather than producing ".pdf".
            safe = os.path.splitext(os.path.basename(filename))[0]
        new_name = os.path.join(out_dir, safe + ".pdf")

        print("%s => %s" % (filename, new_name))
        if dry_run:
            continue
        if os.path.exists(new_name):
            # Never overwrite an existing file.
            print("*** Target %s already exists! ***" % new_name)
        else:
            os.rename(filename, new_name)
I have updated this script in my fork. Main changes:
- Support for new API of PDFMiner
- Implementation of font parsing, as suggested by @khoanguyen8496. Although it introduces new false positives, overall it is able to extract more valid titles.
I've tested it with around 670 pdfs, and around 40 titles were unable to be extracted. Most of these pdfs followed the structure of a typical scientific article.
Is there an alternative to this command that works on Windows?
find . -name "*.pdf" | xargs -I{} pdftitle -d tmp --rename {}
Hello, thank you very much for this, it is very useful.
I am getting the same error again and again:
raise Exception("PDF contains a unicode char that does not " +
Exception: PDF contains a unicode char that does not exist in the font
I tried lots of things related to encoding-decoding my text in the pdf's but nothing seems to work.
Do you have any idea why I'm getting this error?
Thank you!
Alexandra
I've updated this script to work with the currently maintained PDFMiner 3 and pdfminer-six.
pip uninstall pdfminer
pip install pypdf pdfminer-six
Environment info
pip list | grep -e pdfminer -e pypdf
pdfminer.six 20221105
pypdf 3.7.0
python -V
Python 3.10.8
Script
#!/usr/bin/env python
"""
Extract title from PDF file.
Depends on: pypdf, pdfminer.six.
Usage:
find . -name "*.pdf" | xargs -I{} pdftitle -d tmp --rename {}
"""
from io import StringIO
import getopt
import os
import re
import string
import sys
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfparser import PDFSyntaxError
__all__ = ['pdf_title']
def sanitize(filename):
    """Reduce *filename* to characters that are safe in a file name.

    Only ASCII letters, digits, spaces and the characters ``-_.()`` are
    kept; every other character is dropped.
    """
    allowed = "-_.() " + string.ascii_letters + string.digits
    return ''.join(ch for ch in filename if ch in allowed)
def meta_title(filename):
    """Return the title stored in the PDF's document-information metadata.

    Returns an empty string when pypdf raises ``PdfReadError``, when the
    file carries no metadata, or when the metadata has no (or an empty)
    title.
    """
    try:
        with open(filename, 'rb') as pdf_file:
            docinfo = PdfReader(pdf_file).metadata
            # metadata is None for PDFs without an /Info dictionary, so
            # guard before touching .title. The title is read while the
            # file is still open.
            title = docinfo.title if docinfo is not None else None
    except PdfReadError:
        return ""
    return title if title else ""
def copyright_line(line):
    """Tell whether *line* looks like a venue/status banner, not a title.

    The line is lower-cased and searched for one of the marker phrases.
    Returns the ``re`` match object (truthy) on a hit, otherwise ``None``.
    """
    marker = r'technical\s+report|proceedings|preprint|to\s+appear|submission'
    lowered = line.lower()
    return re.search(marker, lowered)
def empty_str(s):
    """Return True when *s* is empty or consists only of whitespace."""
    return not s.strip()
def pdf_text(filename):
    """Return the text pdfminer extracts from the first page of *filename*.

    Returns an empty string when pdfminer raises ``PDFSyntaxError`` or
    ``PDFTextExtractionNotAllowed``. Other errors (for example a missing
    file) propagate to the caller.
    """
    # Imported here because the file's import block does not provide
    # these two names, which the page-by-page pdfminer.six API requires.
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    try:
        # StringIO is already the class (from io import StringIO).
        text = StringIO()
        rsrc = PDFResourceManager()
        device = TextConverter(rsrc, text, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrc, device)
        try:
            with open(filename, 'rb') as pdf_file:
                # get_pages() only yields page objects; each one must be
                # fed to the interpreter for the device to receive text.
                # check_extractable=True asks pdfminer to refuse
                # protected documents, which the except clause handles.
                pages = PDFPage.get_pages(
                    pdf_file, maxpages=1, password='',
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Release the converter on both the success and failure paths.
            device.close()
        return text.getvalue()
    except (PDFSyntaxError, PDFTextExtractionNotAllowed):
        return ""
def title_start(lines):
    """Return the index of the first line that may begin the title.

    That is the first line which is neither blank nor matched by
    ``copyright_line``. Falls back to 0 when no line qualifies.
    """
    candidates = (
        idx for idx, text in enumerate(lines)
        if not (empty_str(text) or copyright_line(text))
    )
    return next(candidates, 0)
def title_end(lines, start, max_lines=2):
    """Return the exclusive end index of the title starting at *start*.

    Up to *max_lines* lines following ``lines[start]`` are inspected and
    the index of the first blank one is returned. When none of them is
    blank (or there are no following lines) the result is ``start + 1``,
    i.e. the title is taken to be a single line.
    """
    first = start + 1
    window = lines[first:first + max_lines]
    for offset, text in enumerate(window):
        if empty_str(text):
            return first + offset
    return first
def text_title(filename):
    """Guess the title from the text returned by ``pdf_text``.

    The text is split into lines, the title span is located with
    ``title_start`` / ``title_end``, and the stripped lines of that span
    are joined with single spaces.
    """
    raw = pdf_text(filename)
    lines = raw.strip().split('\n')
    begin = title_start(lines)
    end = title_end(lines, begin)
    pieces = [piece.strip() for piece in lines[begin:end]]
    return ' '.join(pieces)
def valid_title(title):
    """Return True when *title* is non-blank and has no file extension.

    The extension test uses ``os.path.splitext``: a title is rejected
    when the part after its last dot is non-blank.
    """
    if empty_str(title):
        return False
    extension = os.path.splitext(title)[1]
    return empty_str(extension)
def pdf_title(filename):
    """Return the best available title for the PDF at *filename*.

    The metadata title is tried first, then the title guessed from the
    page text; the first one accepted by ``valid_title`` wins. When both
    are rejected, the file's base name without extension is returned.
    """
    for extract in (meta_title, text_title):
        candidate = extract(filename)
        if valid_title(candidate):
            return candidate
    stem = os.path.splitext(filename)[0]
    return os.path.basename(stem)
if __name__ == "__main__":
    # Command-line driver: print each PDF's title, or with --rename move
    # the file to "<output dir>/<sanitized title>.pdf".
    usage = "Usage: %s [-d output] [--dry-run] [--rename] filenames" % sys.argv[0]

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'nd:', ['dry-run', 'rename'])
    except getopt.GetoptError as err:
        # Report a bad option with the usage text instead of a traceback.
        sys.stderr.write("%s\n" % err)
        print(usage)
        sys.exit(1)

    dry_run = False
    rename = False
    out_dir = "."  # not named "dir", which would shadow the builtin
    for opt, arg in opts:
        if opt in ('-n', '--dry-run'):
            dry_run = True
        elif opt == '--rename':
            rename = True
        elif opt == '-d':
            out_dir = arg

    if len(args) == 0:
        print(usage)
        sys.exit(1)

    for filename in args:
        title = pdf_title(filename)
        if not rename:
            print(title)
            continue

        # Collapse runs of whitespace before stripping unsafe characters.
        safe = sanitize(' '.join(title.split()))
        if not safe:
            # Every character was stripped (e.g. a non-ASCII title); keep
            # the original base name rather than producing ".pdf".
            safe = os.path.splitext(os.path.basename(filename))[0]
        new_name = os.path.join(out_dir, safe + ".pdf")

        print("%s => %s" % (filename, new_name))
        if dry_run:
            continue
        if os.path.exists(new_name):
            # Never overwrite an existing file.
            print("*** Target %s already exists! ***" % new_name)
        else:
            os.rename(filename, new_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi hanjianwei,
I am working on the same problem. Have you ever considered adding font and style information instead of just raw text information to the program? Can you tell me the approx. accuracy of your approach?