documentprocessing · October 20, 2023 10:37
diff --git a/extract-images-from-pdf-in-python-using-pymupdf-library.py b/extract-images-from-pdf-in-python-using-pymupdf-library.py
 # Import PyMuPDF
 import fitz

 # Source PDF whose embedded images will be exported
 file = "data.pdf"

 # Open the document as a context manager so it is closed even if
 # extraction or one of the file writes fails part-way through
 with fitz.open(file) as pdf_file:
    # Walk the pages, numbering them from 1 for messages and file names
    for page_number, page in enumerate(pdf_file, start=1):
        # One entry per image referenced by the page; item 0 is used as its xref
        image_list = page.get_images(full=True)

        # Report how many images were found on this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_number}")
        else:
            print("[!] No images found on page", page_number)

        # Extract every image of the page
        for image_index, img in enumerate(image_list, start=1):
            xref = img[0]
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Save the image under a name built from page and image number
            image_filename = f"page_{page_number}_image_{image_index}.{image_ext}"
            with open(image_filename, "wb") as img_file:
                img_file.write(image_bytes)
diff --git a/extract-tables-from-pdf-in-python-using-pymupdf-library.py b/extract-tables-from-pdf-in-python-using-pymupdf-library.py
 # import package PyMuPDF
 import fitz  

 # Open some document, for example a PDF (could also be EPUB, XPS, etc.).
 # The context manager closes it once the table has been printed.
 with fitz.open("table_handling_example.pdf") as doc:
    # Load a desired page. This works via 0-based numbers
    page = doc[0]

    # Look for tables on this page and display the table count
    tabs = page.find_tables()
    print(f"{len(tabs.tables)} table(s) on {page}")

    # Only index into the result when at least one table was detected;
    # the count printed above already tells the user when there is none
    if tabs.tables:
        # Select the first table and convert it to a pandas DataFrame
        tab = tabs[0]

        df = tab.to_pandas()
        print("Table:")
        print(df)
diff --git a/extract-text-from-pdf-in-python-using-pymupdf-library.py b/extract-text-from-pdf-in-python-using-pymupdf-library.py
 # Import PyMuPDF
 import fitz  

 # Path of the PDF file to read
 pdf_document = "documentprocessing.pdf"

 # Open the document as a context manager so it is closed even if
 # text extraction fails on some page
 with fitz.open(pdf_document) as doc:
    # Concatenate the text of every page in one pass instead of growing
    # a string with += inside an index loop
    extracted_text = "".join(page.get_text() for page in doc)

 # Perform text analysis (e.g., count words)
 word_count = len(extracted_text.split())
 print(f"The Extracted text is as follows:\n{extracted_text}")
 print(f"Total words in the document: {word_count}")
diff --git a/insert-text-into-pdf-in-python-using-pymupdf-library.py b/insert-text-into-pdf-in-python-using-pymupdf-library.py
 # Import PyMuPDF
 import fitz

 # Open a PDF file; the context manager closes it after the save below
 # (the document was previously never closed)
 with fitz.open("documentprocessing.pdf") as doc:
    page = doc[0]  # Access the first page

    # Define the starting point for the text
    p = fitz.Point(75, 150)

    # Define the text to be inserted
    text = "Some text,\nspread across\nseveral lines."

    # Insert the text on the page; the result is reported below as a line count
    rc = page.insert_text(p, text)

    # Print the number of lines printed on the page
    print("%i lines printed on page %i." % (rc, page.number))

    # Save the modified PDF to a new file
    doc.save("text.pdf")
diff --git a/pdf-text-recognition-in-python-using-pymupdf-library.py b/pdf-text-recognition-in-python-using-pymupdf-library.py
 # Import PyMuPDF
 import fitz
 import os

 # Set the TESSDATA_PREFIX environment variable to the folder containing the language file
 os.environ['TESSDATA_PREFIX'] = 'F://'

 # Page to recognise, counted from 1; converted to a 0-based index below
 page_number = 1

 # Open the PDF file as a context manager so it is closed even if OCR fails
 with fitz.open('data.pdf') as pdf_document:
    # Get the page from the PDF document
    page = pdf_document[page_number - 1]

    # Perform OCR using get_textpage_ocr
    textpage_ocr = page.get_textpage_ocr(flags=3, language='eng', dpi=72, full=False, tessdata=None)

    # Extract text from the OCR result
    text = textpage_ocr.extractText()

 # Print the OCR result
 print(text)
	# Import PyMuPDF
	import fitz

	# Source PDF whose embedded images will be exported
	file = "data.pdf"

	# Open the document as a context manager so it is closed even if
	# extraction or one of the file writes fails part-way through
	with fitz.open(file) as pdf_file:
	    # Walk the pages, numbering them from 1 for messages and file names
	    for page_number, page in enumerate(pdf_file, start=1):
	        # One entry per image referenced by the page; item 0 is used as its xref
	        image_list = page.get_images(full=True)

	        # Report how many images were found on this page
	        if image_list:
	            print(f"[+] Found a total of {len(image_list)} images in page {page_number}")
	        else:
	            print("[!] No images found on page", page_number)

	        # Extract every image of the page
	        for image_index, img in enumerate(image_list, start=1):
	            xref = img[0]
	            base_image = pdf_file.extract_image(xref)
	            image_bytes = base_image["image"]
	            image_ext = base_image["ext"]

	            # Save the image under a name built from page and image number
	            image_filename = f"page_{page_number}_image_{image_index}.{image_ext}"
	            with open(image_filename, "wb") as img_file:
	                img_file.write(image_bytes)
	# import package PyMuPDF
	import fitz

	# Open some document, for example a PDF (could also be EPUB, XPS, etc.).
	# The context manager closes it once the table has been printed.
	with fitz.open("table_handling_example.pdf") as doc:
	    # Load a desired page. This works via 0-based numbers
	    page = doc[0]

	    # Look for tables on this page and display the table count
	    tabs = page.find_tables()
	    print(f"{len(tabs.tables)} table(s) on {page}")

	    # Only index into the result when at least one table was detected;
	    # the count printed above already tells the user when there is none
	    if tabs.tables:
	        # Select the first table and convert it to a pandas DataFrame
	        tab = tabs[0]

	        df = tab.to_pandas()
	        print("Table:")
	        print(df)
	# Import PyMuPDF
	import fitz

	# Path of the PDF file to read
	pdf_document = "documentprocessing.pdf"

	# Open the document as a context manager so it is closed even if
	# text extraction fails on some page
	with fitz.open(pdf_document) as doc:
	    # Concatenate the text of every page in one pass instead of growing
	    # a string with += inside an index loop
	    extracted_text = "".join(page.get_text() for page in doc)

	# Perform text analysis (e.g., count words)
	word_count = len(extracted_text.split())
	print(f"The Extracted text is as follows:\n{extracted_text}")
	print(f"Total words in the document: {word_count}")
	# Import PyMuPDF
	import fitz
	import os

	# Set the TESSDATA_PREFIX environment variable to the folder containing the language file
	os.environ['TESSDATA_PREFIX'] = 'F://'

	# Page to recognise, counted from 1; converted to a 0-based index below
	page_number = 1

	# Open the PDF file as a context manager so it is closed even if OCR fails
	with fitz.open('data.pdf') as pdf_document:
	    # Get the page from the PDF document
	    page = pdf_document[page_number - 1]

	    # Perform OCR using get_textpage_ocr
	    textpage_ocr = page.get_textpage_ocr(flags=3, language='eng', dpi=72, full=False, tessdata=None)

	    # Extract text from the OCR result
	    text = textpage_ocr.extractText()

	# Print the OCR result
	print(text)