Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save documentprocessing/c7b1c8cce28a52996c1940700b393cc6 to your computer and use it in GitHub Desktop.
Save documentprocessing/c7b1c8cce28a52996c1940700b393cc6 to your computer and use it in GitHub Desktop.
Extract text, images, and attachments from PDF files in Python using the pypdf library. See https://products.documentprocessing.com/parser/python/pypdf/ for details.
# Extract every embedded attachment from a PDF and save each one to disk.
from pathlib import Path

from pypdf import PdfReader

# Open the PDF file
reader = PdfReader("data.pdf")

# reader.attachments is a mapping of attachment name -> list of byte strings.
# Iterating the mapping directly yields only the names, so .items() is needed
# to obtain (name, contents) pairs.
for name, content_list in reader.attachments.items():
    # The name comes from inside the PDF; keep only its final path component
    # so a crafted name cannot direct the write outside the current directory.
    safe_name = Path(name).name
    # Each name maps to a list of payloads; the index keeps output files distinct.
    for i, content in enumerate(content_list):
        with open(f"{safe_name}-{i}", "wb") as fp:
            fp.write(content)
# Save every image found on every page of a PDF into the current directory.
from pypdf import PdfReader

# Load the source document
reader = PdfReader("data.pdf")

# Running number of images written so far; it prefixes each output filename
count = 0

for page in reader.pages:
    for image in page.images:
        # Output name is the running counter followed by the image's own name
        target = f"{count}{image.name}"
        with open(target, "wb") as fp:
            fp.write(image.data)
        count += 1
# Print the text content of every page in a PDF.
from pypdf import PdfReader

# Load the source document
reader = PdfReader("documentprocessing.pdf")

# Walk the pages in order and print each page's extracted text
for page in reader.pages:
    page_text = page.extract_text()
    print(page_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment