Last active
October 24, 2023 14:26
-
-
Save documentprocessing/9077db188c7dc5d239c086b2fd02dbf1 to your computer and use it in GitHub Desktop.
Extract text and font information from PDF documents in Python using the pdfminer.six library. See https://products.documentprocessing.com/parser/python/pdfminer.six/ for more details.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# List every distinct (font name, font size) combination found in a PDF.
#
# Import required classes from the pdfminer.six library
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator

# Open the PDF file. The `with` statement closes it on exit, so no explicit
# close() call is needed. All page processing stays inside the block because
# the parser reads from the open file object.
with open('documentprocessing.pdf', 'rb') as pdf_file:
    # Create a PDFParser
    pdf_parser = PDFParser(pdf_file)
    # Create a PDFDocument
    pdf_document = PDFDocument(pdf_parser)
    # Create a PDFResourceManager
    pdf_resource_manager = PDFResourceManager()
    # Create a PDFDevice that collects the layout of each processed page
    pdf_device = PDFPageAggregator(pdf_resource_manager)
    # Create a PDFPageInterpreter
    pdf_page_interpreter = PDFPageInterpreter(pdf_resource_manager, pdf_device)

    # Initialize a set to store unique font information
    unique_fonts = set()

    # Iterate through the pages in the PDF
    for page in PDFPage.create_pages(pdf_document):
        pdf_page_interpreter.process_page(page)
        layout = pdf_device.get_result()

        # Iterate through layout elements; only those exposing a
        # `fontname` attribute are of interest.
        for element in layout:
            if not hasattr(element, "fontname"):
                continue

            # Font names may look like "ABCDEF+RealName" (presumably a
            # subset-font tag). Take the last piece so that names without
            # a '+' are kept whole instead of raising IndexError.
            font_name = element.fontname.split('+', 1)
            font_info = f"Font Name: {font_name[-1]}"
            if hasattr(element, "size"):
                font_info += f", Font Size: {int(element.size)}"

            # Print each font description only the first time it is seen
            if font_info not in unique_fonts:
                print(font_info)
                unique_fonts.add(font_info)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print the text of a PDF with blank lines removed and each remaining
# line trimmed of surrounding whitespace.
#
# Import extract_text function from the pdfminer.six library
from pdfminer.high_level import extract_text

# Path of the PDF to read
pdf_file = 'documentprocessing.pdf'

# Pull the full text out of the document
text = extract_text(pdf_file)

# Trim every line once, then keep only the ones that still have content
trimmed = (raw_line.strip() for raw_line in text.splitlines())
lines = [entry for entry in trimmed if entry]

# Reassemble the surviving lines, one per row
cleaned_text = '\n'.join(lines)

# Show the result
print(cleaned_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment