Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save documentprocessing/c9066d385001538b9c1b48fdbc31e669 to your computer and use it in GitHub Desktop.
Save documentprocessing/c9066d385001538b9c1b48fdbc31e669 to your computer and use it in GitHub Desktop.
Explore the PDF parsing features of PyMuPDF, such as extracting text, images, and tables from a PDF, inserting text into a PDF, and recognizing text with OCR. See https://products.documentprocessing.com/parser/python/pymupdf/ for more details.
# Extract every embedded image from a PDF and write each one to its own file
# in the current working directory.
import fitz  # PyMuPDF

# File path you want to extract images from
file = "data.pdf"

# Open the document as a context manager so it is closed even if an
# extraction or a file write fails part-way through.
with fitz.open(file) as pdf_file:
    # Pages are numbered from 1 for display and for the output file names.
    for page_number, page in enumerate(pdf_file, start=1):
        # Get the image list for the page
        image_list = page.get_images(full=True)

        # Report the number of images found on this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_number}")
        else:
            print("[!] No images found on page", page_number)

        # Extract images from the page
        for image_index, img in enumerate(image_list, start=1):
            # The first item of each entry is the xref passed to extract_image().
            xref = img[0]
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Save the image to a file
            image_filename = f"page_{page_number}_image_{image_index}.{image_ext}"
            with open(image_filename, "wb") as img_file:
                img_file.write(image_bytes)
# Find the tables on the first page of a document and print the first one
# as a pandas DataFrame.
import fitz  # PyMuPDF

# Open some document, for example a PDF (could also be EPUB, XPS, etc.).
# The context manager closes the document when the block ends.
with fitz.open("table_handling_example.pdf") as doc:
    # Load a desired page. This works via 0-based numbers
    page = doc[0]

    # Look for tables on this page and display the table count
    tabs = page.find_tables()
    print(f"{len(tabs.tables)} table(s) on {page}")

    # Only index into the result when at least one table was found;
    # tabs[0] would raise IndexError on a page without tables.
    if tabs.tables:
        # Select the first table
        tab = tabs[0]
        df = tab.to_pandas()
        print("Table:")
        print(df)
# Extract the text of every page of a PDF and report a simple word count.
import fitz  # PyMuPDF

# Path of the PDF file to read
pdf_document = "documentprocessing.pdf"

# Open the document as a context manager so it is closed even if text
# extraction raises on some page.
with fitz.open(pdf_document) as doc:
    # Join the per-page text in one pass instead of growing a string with +=.
    extracted_text = "".join(page.get_text() for page in doc)

# Perform text analysis (e.g., count whitespace-separated words)
word_count = len(extracted_text.split())
print(f"The Extracted text is as follows:\n{extracted_text}")
print(f"Total words in the document: {word_count}")
# Insert a few lines of text on the first page of a PDF and save the result
# to a new file.
import fitz  # PyMuPDF

# Open the document as a context manager so it is closed once the modified
# copy has been saved (the original script never closed it).
with fitz.open("documentprocessing.pdf") as doc:
    page = doc[0]  # Access the first page

    # Define the starting point for the text
    p = fitz.Point(75, 150)

    # Define the text to be inserted
    text = "Some text,\nspread across\nseveral lines."

    # Insert the text on the page; the return value is used below as the
    # number of lines printed.
    rc = page.insert_text(p, text)

    # Print the number of lines printed on the page.
    # NOTE: page.number is reported as-is (page was loaded with index 0).
    print(f"{rc} lines printed on page {page.number}.")

    # Save the modified PDF to a new file
    doc.save("text.pdf")
# Run OCR on one page of a PDF and print the recognized text.
import os

import fitz  # PyMuPDF

# Set the TESSDATA_PREFIX environment variable to the folder containing the language file.
# NOTE(review): 'F://' is a machine-specific location and overwrites any
# value already present in the environment; adjust it for your system.
os.environ['TESSDATA_PREFIX'] = 'F://'

# 1-based number of the page to process
page_number = 1

# Open the document as a context manager so it is closed even if OCR fails
# (for example when the Tesseract language data cannot be found).
with fitz.open('data.pdf') as pdf_document:
    # Get the page from the PDF document (document indexing is 0-based)
    page = pdf_document[page_number - 1]

    # Perform OCR using get_textpage_ocr
    textpage_ocr = page.get_textpage_ocr(flags=3, language='eng', dpi=72, full=False, tessdata=None)

    # Extract text from the OCR result
    text = textpage_ocr.extractText()

    # Print the OCR result
    print(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment