documentprocessing · October 24, 2023 14:26
diff --git a/extract-font-information-from-pdf-document-in-python-using-pdfminersix-library.py b/extract-font-information-from-pdf-document-in-python-using-pdfminersix-library.py
 # Import required classes from the pdfminer.six library
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.converter import PDFPageAggregator

 # Open the PDF file in binary mode; the with-statement closes it on exit,
 # so no explicit close() call is needed afterwards
 with open('documentprocessing.pdf', 'rb') as pdf_file:

     # Create a PDFParser over the open file
     pdf_parser = PDFParser(pdf_file)

     # Create a PDFDocument from the parser
     pdf_document = PDFDocument(pdf_parser)

     # Create a PDFResourceManager
     pdf_resource_manager = PDFResourceManager()

     # Create a PDFDevice that aggregates each processed page into a layout
     pdf_device = PDFPageAggregator(pdf_resource_manager)

     # Create a PDFPageInterpreter that feeds pages into the device
     pdf_page_interpreter = PDFPageInterpreter(pdf_resource_manager, pdf_device)

     # Font descriptions already printed, so each one is reported only once
     unique_fonts = set()

     # Iterate through the pages in the PDF
     for page in PDFPage.create_pages(pdf_document):
         pdf_page_interpreter.process_page(page)

         # Layout produced by the device for the page just processed
         layout = pdf_device.get_result()

         # Iterate through layout elements
         for element in layout:
             # Only elements that expose a font name are of interest
             if not hasattr(element, "fontname"):
                 continue

             # Drop any prefix before the first '+' (e.g. the subset tag in
             # "ABCDEF+Arial"). Taking the LAST piece also handles names that
             # contain no '+', where indexing [1] would raise IndexError.
             font_name = element.fontname.split('+', 1)
             font_info = f"Font Name: {font_name[-1]}"
             if hasattr(element, "size"):
                 # The size is truncated to a whole number for display
                 font_info += f", Font Size: {int(element.size)}"

             # Print each distinct font description only the first time
             if font_info not in unique_fonts:
                 print(font_info)
                 unique_fonts.add(font_info)
diff --git a/extract-text-from-pdf-document-in-python-using-pdfminersix-library.py b/extract-text-from-pdf-document-in-python-using-pdfminersix-library.py
 # Import extract_text function from the pdfminer.six library
 from pdfminer.high_level import extract_text

 # Path of the PDF document to read
 pdf_file = 'documentprocessing.pdf'

 # Pull the text out of the document
 text = extract_text(pdf_file)

 # Trim surrounding whitespace from every line, then discard the lines
 # that end up empty
 lines = list(filter(None, map(str.strip, text.splitlines())))

 # Reassemble the remaining lines, separated by newline characters
 cleaned_text = '\n'.join(lines)

 # Output the cleaned text
 print(cleaned_text)
	# Import required classes from the pdfminer.six library
	from pdfminer.pdfparser import PDFParser
	from pdfminer.pdfdocument import PDFDocument
	from pdfminer.pdfpage import PDFPage
	from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
	from pdfminer.converter import PDFPageAggregator

	# Open the PDF file in binary mode; the with-statement closes it on exit,
	# so no explicit close() call is needed afterwards
	with open('documentprocessing.pdf', 'rb') as pdf_file:

	    # Create a PDFParser over the open file
	    pdf_parser = PDFParser(pdf_file)

	    # Create a PDFDocument from the parser
	    pdf_document = PDFDocument(pdf_parser)

	    # Create a PDFResourceManager
	    pdf_resource_manager = PDFResourceManager()

	    # Create a PDFDevice that aggregates each processed page into a layout
	    pdf_device = PDFPageAggregator(pdf_resource_manager)

	    # Create a PDFPageInterpreter that feeds pages into the device
	    pdf_page_interpreter = PDFPageInterpreter(pdf_resource_manager, pdf_device)

	    # Font descriptions already printed, so each one is reported only once
	    unique_fonts = set()

	    # Iterate through the pages in the PDF
	    for page in PDFPage.create_pages(pdf_document):
	        pdf_page_interpreter.process_page(page)

	        # Layout produced by the device for the page just processed
	        layout = pdf_device.get_result()

	        # Iterate through layout elements
	        for element in layout:
	            # Only elements that expose a font name are of interest
	            if not hasattr(element, "fontname"):
	                continue

	            # Drop any prefix before the first '+' (e.g. the subset tag in
	            # "ABCDEF+Arial"). Taking the LAST piece also handles names that
	            # contain no '+', where indexing [1] would raise IndexError.
	            font_name = element.fontname.split('+', 1)
	            font_info = f"Font Name: {font_name[-1]}"
	            if hasattr(element, "size"):
	                # The size is truncated to a whole number for display
	                font_info += f", Font Size: {int(element.size)}"

	            # Print each distinct font description only the first time
	            if font_info not in unique_fonts:
	                print(font_info)
	                unique_fonts.add(font_info)
	# Import extract_text function from the pdfminer.six library
	from pdfminer.high_level import extract_text

	# Path of the PDF document to read
	pdf_file = 'documentprocessing.pdf'

	# Pull the text out of the document
	text = extract_text(pdf_file)

	# Trim surrounding whitespace from every line, then discard the lines
	# that end up empty
	lines = list(filter(None, map(str.strip, text.splitlines())))

	# Reassemble the remaining lines, separated by newline characters
	cleaned_text = '\n'.join(lines)

	# Output the cleaned text
	print(cleaned_text)