Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save documentprocessing/9077db188c7dc5d239c086b2fd02dbf1 to your computer and use it in GitHub Desktop.
Save documentprocessing/9077db188c7dc5d239c086b2fd02dbf1 to your computer and use it in GitHub Desktop.
Extract text and font information from PDF documents in Python using the pdfminer.six library. See https://products.documentprocessing.com/parser/python/pdfminer.six/ for more details.
# Import required classes from the pdfminer.six library
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
# Print every distinct font name (and size, when available) found in the PDF.
# The with-statement closes the file on exit, so no explicit close() is needed.
with open('documentprocessing.pdf', 'rb') as pdf_file:
    # Build the parser -> document -> interpreter pipeline
    pdf_parser = PDFParser(pdf_file)
    pdf_document = PDFDocument(pdf_parser)
    pdf_resource_manager = PDFResourceManager()
    # The aggregator device collects the layout objects of each processed page
    pdf_device = PDFPageAggregator(pdf_resource_manager)
    pdf_page_interpreter = PDFPageInterpreter(pdf_resource_manager, pdf_device)
    # Font descriptions that have already been printed
    unique_fonts = set()
    # Iterate through the pages in the PDF
    for page in PDFPage.create_pages(pdf_document):
        pdf_page_interpreter.process_page(page)
        layout = pdf_device.get_result()
        # Iterate through layout elements; only those exposing a font name matter
        for element in layout:
            if not hasattr(element, "fontname"):
                continue
            # Subset fonts are named "TAG+RealName"; fonts that are not subsets
            # have no '+', so take the last piece instead of indexing [1],
            # which would raise IndexError for a plain name such as "Helvetica".
            font_name = element.fontname.split('+', 1)[-1]
            font_info = f"Font Name: {font_name}"
            if hasattr(element, "size"):
                font_info += f", Font Size: {int(element.size)}"
            # Report each distinct name/size combination only once
            if font_info not in unique_fonts:
                print(font_info)
                unique_fonts.add(font_info)
# High-level helper that returns the text of a whole PDF as one string
from pdfminer.high_level import extract_text

# Path of the PDF whose text is extracted
pdf_file = 'documentprocessing.pdf'

# Pull the raw text out of the document
text = extract_text(pdf_file)

# Trim surrounding whitespace from every line, then drop the lines that
# end up empty so the output contains no blank lines
stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
lines = [entry for entry in stripped_lines if entry]

# Reassemble the remaining lines, one per row, and show the result
cleaned_text = '\n'.join(lines)
print(cleaned_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment