Last active
October 20, 2023 10:37
-
-
Save documentprocessing/c9066d385001538b9c1b48fdbc31e669 to your computer and use it in GitHub Desktop.
Explore the PDF parsing features of PyMuPDF, such as extracting text, images, and tables from a PDF, inserting text into a PDF, and recognizing text with OCR. See https://products.documentprocessing.com/parser/python/pymupdf/ for more details.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract every image referenced by the pages of a PDF and write each one to
# its own file in the current working directory.

# Import PyMuPDF
import fitz

# File path you want to extract images from
file = "data.pdf"

# Open the file. The context manager closes the document even when
# extraction or writing an image file fails part-way through.
with fitz.open(file) as pdf_file:
    # Iterate over PDF pages; page_index is 0-based, so 1 is added wherever
    # the number is shown to the user or used in a file name.
    for page_index, page in enumerate(pdf_file):
        # Get the image list for the page
        image_list = page.get_images(full=True)

        # Printing the number of images found on this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_index + 1}")
        else:
            print("[!] No images found on page", page_index + 1)

        # Extract images from the page
        for image_index, img in enumerate(image_list, start=1):
            # The first item of each image entry is passed to extract_image()
            # as the image's cross-reference number.
            xref = img[0]
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Save the image to a file named after its page and position
            image_filename = f"page_{page_index + 1}_image_{image_index}.{image_ext}"
            with open(image_filename, "wb") as img_file:
                img_file.write(image_bytes)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Detect tables on the first page of a document and print the first one as a
# pandas DataFrame.

# import package PyMuPDF
import fitz

# Open some document, for example a PDF (could also be EPUB, XPS, etc.).
# The context manager closes the document when the block is left.
with fitz.open("table_handling_example.pdf") as doc:
    # Load a desired page. This works via 0-based numbers
    page = doc[0]

    # Look for tables on this page and display the table count
    tabs = page.find_tables()
    print(f"{len(tabs.tables)} table(s) on {page}")

    # Only index into the result when at least one table was detected;
    # otherwise tabs[0] would raise IndexError. The count printed above
    # already tells the user that nothing was found.
    if tabs.tables:
        # Select the first table
        tab = tabs[0]
        df = tab.to_pandas()
        print("Table:")
        print(df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract the plain text of every page of a PDF, print it, and report the
# total number of whitespace-separated words.

# Import PyMuPDF
import fitz

# Path of the PDF file to read
pdf_document = "documentprocessing.pdf"

# Open the PDF and collect the text of each page in page order. Joining the
# per-page strings once avoids repeated string concatenation, and the context
# manager closes the document even if text extraction fails.
with fitz.open(pdf_document) as doc:
    extracted_text = "".join(page.get_text() for page in doc)

# Perform text analysis (e.g., count words)
word_count = len(extracted_text.split())

print(f"The Extracted text is as follows:\n{extracted_text}")
print(f"Total words in the document: {word_count}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Insert a few lines of text onto the first page of a PDF and save the result
# to a new file.

# Import PyMuPDF
import fitz

# Open a PDF file. The context manager closes the document once the modified
# copy has been saved (or if any step fails).
with fitz.open("documentprocessing.pdf") as doc:
    page = doc[0]  # Access the first page

    # Define the starting point for the text
    p = fitz.Point(75, 150)

    # Define the text to be inserted
    text = "Some text,\nspread across\nseveral lines."

    # Insert the text on the page; the return value is reported below as the
    # number of lines printed.
    rc = page.insert_text(p, text)

    # Print the number of lines printed on the page. page.number is printed
    # as-is, so the first page (doc[0]) is reported as page 0.
    print(f"{rc} lines printed on page {page.number}.")

    # Save the modified PDF to a new file
    doc.save("text.pdf")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run OCR on one page of a PDF and print the recognized text.

import os

# Import PyMuPDF
import fitz

# Set the TESSDATA_PREFIX environment variable to the folder containing the language file.
# NOTE(review): 'F://' is a machine-specific location; adjust it to wherever
# the Tesseract language data lives on your system.
os.environ['TESSDATA_PREFIX'] = 'F://'

# Open the PDF file. The context manager closes the document even if OCR
# fails (for example, when the language data cannot be found).
with fitz.open('data.pdf') as pdf_document:
    # Get the page from the PDF document; page_number is 1-based for
    # readability, while document indexing is 0-based.
    page_number = 1
    page = pdf_document[page_number - 1]

    # Perform OCR using get_textpage_ocr
    textpage_ocr = page.get_textpage_ocr(flags=3, language='eng', dpi=72, full=False, tessdata=None)

    # Extract text from the OCR result
    text = textpage_ocr.extractText()

# Print the OCR result
print(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment