Skip to content

Instantly share code, notes, and snippets.

@aspose-com-gists
Last active March 20, 2025 06:41
Show Gist options
  • Save aspose-com-gists/8674c0f51022ba28947fb35f1c904b21 to your computer and use it in GitHub Desktop.
Save aspose-com-gists/8674c0f51022ba28947fb35f1c904b21 to your computer and use it in GitHub Desktop.
How to Parse PDFs in Python: A Step-by-Step Guide
# This code example shows how to extract text from all pages of a PDF document in Python
import aspose.pdf as ap

# Open the source PDF document.
document = ap.Document("AddText.pdf")

# A single TextAbsorber accumulates the text of every page it is applied to.
text_absorber = ap.text.TextAbsorber()

# Applying the absorber to the page collection processes all pages.
document.pages.accept(text_absorber)

# Retrieve the extracted text.
extracted_text = text_absorber.text

# Destination file for the extracted text.
file_path = "extracted-text.txt"

# Write the text as UTF-8, terminated by a newline; the `with` block
# closes the file even if the write fails.
with open(file_path, "w", encoding="utf-8") as tw:
    tw.write(extracted_text + "\n")
# This code example shows how to extract images from a PDF in Python
import aspose.pdf as ap

# Open the source PDF document.
document = ap.Document("Sample.pdf")

# Pick one image: index 1 on both collections selects the first image
# of the first page.
x_image = document.pages[1].resources.images[1]

# Destination path for the extracted image.
output_image_path = "OutputImage.jpg"

# Write the raw image bytes; binary mode avoids any text encoding.
with open(output_image_path, "wb") as output_image:
    output_image.write(x_image.to_stream().read())
# This code example shows how to extract text from a specific page of a PDF document in Python
import aspose.pdf as ap

# Open the source PDF document.
document = ap.Document("AddText.pdf")

# Create the text absorber that collects the page's text.
text_absorber = ap.text.TextAbsorber()

# Apply the absorber to a single page only (index 1 is the first page).
document.pages[1].accept(text_absorber)

# Retrieve the extracted text.
extracted_text = text_absorber.text

# Destination file for the extracted text.
file_path = "extracted-text.txt"

# Write the text as UTF-8, terminated by a newline; the `with` block
# closes the file even if the write fails.
with open(file_path, "w", encoding="utf-8") as tw:
    tw.write(extracted_text + "\n")
# This code example shows how to extract tables from a PDF document in Python
import aspose.pdf as ap

# Load the PDF file. The module is imported as `ap`, so the document
# must be created through that alias.
document = ap.Document("sample.pdf")

# Process every page of the document.
for page in document.pages:
    # Use a fresh TableAbsorber per page so table_list only holds the
    # tables found on the current page.
    absorber = ap.text.TableAbsorber()

    # Identify tables on the current page.
    absorber.visit(page)

    # Walk table -> row -> cell -> text fragment and print each fragment.
    for table in absorber.table_list:
        for row in table.row_list:
            for cell in row.cell_list:
                # A cell's content is exposed as a collection of text fragments.
                text_fragment_collection = cell.text_fragments
                for fragment in text_fragment_collection:
                    print(fragment.text)
# This code example shows how to extract text from a multi-column PDF in Python
import io
import aspose.pdf as ap

# Open the source PDF document.
document = ap.Document("multi-column-sample.pdf")

# Collect every text fragment so its font size can be adjusted.
text_fragment_absorber = ap.text.TextFragmentAbsorber()

# Apply the absorber to the whole page collection (all pages).
document.pages.accept(text_fragment_absorber)

# Get the collection of extracted text fragments.
text_fragment_collection = text_fragment_absorber.text_fragments

# Scale each fragment's font to 70% of its original size (a 30%
# reduction) to improve the text extraction done below.
for text_fragment in text_fragment_collection:
    text_fragment.text_state.font_size *= 0.7

# Save the modified document to an in-memory stream instead of disk.
source_stream = io.BytesIO()
document.save(source_stream)

# Rewind the stream so the document can be reloaded from its start.
source_stream.seek(0)
dest_document = ap.Document(source_stream)

# Extract the text of the resized document.
text_absorber = ap.text.TextAbsorber()
dest_document.pages.accept(text_absorber)
extracted_text = text_absorber.text

# Save the extracted text to a UTF-8 file.
with open("ExtractColumnsText_out.txt", "w", encoding="utf-8") as file:
    file.write(extracted_text)
# This code example shows how to extract text from a specific region of a page in a PDF document using Python
import aspose.pdf as ap

# Open the source PDF document.
document = ap.Document("sample.pdf")

# Create the TextAbsorber and restrict its search to a rectangle.
absorber = ap.text.TextAbsorber()
absorber.text_search_options.limit_to_page_bounds = True
# NOTE(review): the arguments are presumably lower-left x/y then
# upper-right x/y in page units - confirm against the Rectangle API.
absorber.text_search_options.rectangle = ap.Rectangle(100, 200, 250, 350, True)

# Apply the absorber to the first page only.
document.pages[1].accept(absorber)

# Get the extracted text.
extracted_text = absorber.text

# Destination file for the extracted text.
file_path = "extracted-text.txt"

# Write the text as UTF-8, terminated by a newline; the `with` block
# closes the file even if the write fails.
with open(file_path, "w", encoding="utf-8") as tw:
    tw.write(extracted_text + "\n")
# This code example shows how to extract text from a PDF document using a scale factor in Python
import aspose.pdf as ap

# Open the source PDF document.
document = ap.Document("sample.pdf")

# Create the TextAbsorber that will receive the extraction options.
text_absorber = ap.text.TextAbsorber()

# Configure extraction: PURE formatting mode with a 0.5 scale factor.
extraction_options = ap.text.TextExtractionOptions(ap.text.TextExtractionOptions.TextFormattingMode.PURE)
extraction_options.scale_factor = 0.5  # Adjusts text recognition for better column detection
text_absorber.extraction_options = extraction_options

# Apply the absorber to the whole page collection (all pages).
document.pages.accept(text_absorber)

# Get the extracted text.
extracted_text = text_absorber.text

# Save the extracted text to a UTF-8 file.
with open("ExtractTextUsingScaleFactor_out.txt", "w", encoding="utf-8") as file:
    file.write(extracted_text)
# Example: read a PDF's document-information entries and print them.
import aspose.pdf as ap

# Open the PDF whose metadata is inspected.
document = ap.Document("Sample.pdf")

# The `info` object exposes the document's metadata fields.
metadata = document.info

# Print one labelled line per metadata field.
print(f"Author: {metadata.author}")
print(f"Creation Date: {metadata.creation_date}")
print(f"Keywords: {metadata.keywords}")
print(f"Modify Date: {metadata.mod_date}")
print(f"Subject: {metadata.subject}")
print(f"Title: {metadata.title}")
# This code example shows how to read highlight annotations from a PDF page in Python
import aspose.pdf as ap

# Load the PDF document.
document = ap.Document("annotations.pdf")

# Loop through all annotations on the first page and report only the
# highlight annotations.
for annotation in document.pages[1].annotations:
    if annotation.annotation_type == ap.annotations.AnnotationType.HIGHLIGHT:
        # Print annotation details.
        print(f"Title: {annotation.full_name}")
        print(f"Annotation Rectangle: {annotation.rect}")
# This code example shows how to read line annotations from a PDF page in Python
import aspose.pdf as ap

# Load the PDF document.
document = ap.Document("annotations.pdf")

# Loop through all annotations on the first page and report only the
# line annotations.
for annotation in document.pages[1].annotations:
    if annotation.annotation_type == ap.annotations.AnnotationType.LINE:
        # Print annotation details.
        print(f"Annotation Rectangle: {annotation.rect}")
# This code example shows how to read text annotations from a PDF page in Python
import aspose.pdf as ap

# Load the PDF document.
document = ap.Document("annotations.pdf")

# Loop through all annotations on the first page and report only the
# text annotations.
for annotation in document.pages[1].annotations:
    if annotation.annotation_type == ap.annotations.AnnotationType.TEXT:
        # Print annotation details.
        print(f"Title: {annotation.full_name}")
        print(f"Contents: {annotation.contents}")
        print(f"Annotation Rectangle: {annotation.rect}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment