Last active
March 20, 2025 06:41
-
-
Save aspose-com-gists/8674c0f51022ba28947fb35f1c904b21 to your computer and use it in GitHub Desktop.
How to Parse PDFs in Python: A Step-by-Step Guide
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: dump the text of every page of a PDF into a UTF-8 text file.
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("AddText.pdf")

# Run a TextAbsorber over the whole page collection, then read back
# the text it accumulated.
absorber = ap.text.TextAbsorber()
pdf_document.pages.accept(absorber)
all_pages_text = absorber.text

# Write the text, terminated by a single newline, to the output file.
output_path = "extracted-text.txt"
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(all_pages_text + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: save the first image of the first page of a PDF to disk.
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("Sample.pdf")

# Index 1 selects the first page, and then the first image in that
# page's resources.
first_page = pdf_document.pages[1]
image = first_page.resources.images[1]

# Copy the raw image stream into a binary output file.
target_path = "OutputImage.jpg"
with open(target_path, "wb") as image_file:
    image_stream = image.to_stream()
    image_file.write(image_stream.read())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: dump the text of one specific page of a PDF into a UTF-8 text file.
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("AddText.pdf")

# Run a TextAbsorber over page 1 only (not the whole page collection),
# then read back the text it accumulated.
absorber = ap.text.TextAbsorber()
target_page = pdf_document.pages[1]
target_page.accept(absorber)
page_text = absorber.text

# Write the text, terminated by a single newline, to the output file.
output_path = "extracted-text.txt"
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(page_text + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code example shows how to extract tables from a PDF document in Python
import aspose.pdf as ap

# Load the PDF file. The module is imported under the alias `ap`; the name
# `pdf` is never bound in this script, so `pdf.Document` raised NameError.
document = ap.Document("sample.pdf")

# Process all pages
for page in document.pages:
    # Use a fresh TableAbsorber for each page and let it visit that page
    absorber = ap.text.TableAbsorber()
    absorber.visit(page)

    # Walk tables -> rows -> cells -> text fragments and print each
    # fragment's text on its own line
    for table in absorber.table_list:
        for row in table.row_list:
            for cell in row.cell_list:
                for fragment in cell.text_fragments:
                    print(fragment.text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: extract text from a multi-column PDF by shrinking the fonts first.
import io
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("multi-column-sample.pdf")

# Collect the text fragments of every page in the document.
fragment_absorber = ap.text.TextFragmentAbsorber()
pdf_document.pages.accept(fragment_absorber)
fragments = fragment_absorber.text_fragments

# Scale each fragment's font size to 70% of its current value; the
# original example does this to improve the subsequent text extraction.
for fragment in fragments:
    fragment.text_state.font_size *= 0.7

# Round-trip the modified document through an in-memory buffer so the
# extraction below runs on the resized text.
buffer = io.BytesIO()
pdf_document.save(buffer)
buffer.seek(0)
resized_document = ap.Document(buffer)

# Extract the text of every page of the reloaded document.
absorber = ap.text.TextAbsorber()
resized_document.pages.accept(absorber)
column_text = absorber.text

# Write the extracted text to the output file as-is.
with open("ExtractColumnsText_out.txt", "w", encoding="utf-8") as out_file:
    out_file.write(column_text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: extract the text inside a rectangular region of one PDF page.
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("sample.pdf")

# Configure a TextAbsorber whose search is limited to the page bounds
# and to the given rectangle.
region_absorber = ap.text.TextAbsorber()
search_options = region_absorber.text_search_options
search_options.limit_to_page_bounds = True
search_options.rectangle = ap.Rectangle(100, 200, 250, 350, True)

# Run the absorber over page 1 and read back the text it found.
pdf_document.pages[1].accept(region_absorber)
region_text = region_absorber.text

# Write the text, terminated by a single newline, to the output file.
output_path = "extracted-text.txt"
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(region_text + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: extract text from a PDF using pure formatting mode and a scale factor.
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("sample.pdf")

# Build extraction options: pure text formatting mode with a 0.5 scale
# factor (the original example uses this for better column detection).
formatting_mode = ap.text.TextExtractionOptions.TextFormattingMode.PURE
options = ap.text.TextExtractionOptions(formatting_mode)
options.scale_factor = 0.5

# Attach the options to a TextAbsorber and run it over every page.
absorber = ap.text.TextAbsorber()
absorber.extraction_options = options
pdf_document.pages.accept(absorber)
scaled_text = absorber.text

# Write the extracted text to the output file as-is.
with open("ExtractTextUsingScaleFactor_out.txt", "w", encoding="utf-8") as out_file:
    out_file.write(scaled_text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: print the metadata fields stored in a PDF's document info.
import aspose.pdf as ap

# Load the source PDF and fetch its info object.
pdf_document = ap.Document("Sample.pdf")
metadata = pdf_document.info

# (printed label, attribute name) pairs, in output order.
fields = (
    ("Author", "author"),
    ("Creation Date", "creation_date"),
    ("Keywords", "keywords"),
    ("Modify Date", "mod_date"),
    ("Subject", "subject"),
    ("Title", "title"),
)

# Read and print one field at a time so each attribute is accessed
# immediately before its line is printed.
for label, attribute in fields:
    print(f"{label}: {getattr(metadata, attribute)}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: print details of the highlight annotations on the first page of a PDF.
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("annotations.pdf")

# Scan the annotations of page 1, skipping everything that is not a highlight.
for item in pdf_document.pages[1].annotations:
    if item.annotation_type != ap.annotations.AnnotationType.HIGHLIGHT:
        continue
    # The "Title" label shows the annotation's full_name attribute.
    print(f"Title: {item.full_name}")
    print(f"Annotation Rectangle: {item.rect}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: print the rectangle of each line annotation on the first page of a PDF.
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("annotations.pdf")

# Scan the annotations of page 1, skipping everything that is not a line.
for item in pdf_document.pages[1].annotations:
    if item.annotation_type != ap.annotations.AnnotationType.LINE:
        continue
    print(f"Annotation Rectangle: {item.rect}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: print the rectangle of each link annotation on the first page of a PDF.
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("annotations.pdf")

# Scan the annotations of page 1, skipping everything that is not a link.
for item in pdf_document.pages[1].annotations:
    if item.annotation_type != ap.annotations.AnnotationType.LINK:
        continue
    print(f"Annotation Rectangle: {item.rect}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: print details of the text annotations on the first page of a PDF.
import aspose.pdf as ap

# Load the source PDF.
pdf_document = ap.Document("annotations.pdf")

# Scan the annotations of page 1, skipping everything that is not a text annotation.
for item in pdf_document.pages[1].annotations:
    if item.annotation_type != ap.annotations.AnnotationType.TEXT:
        continue
    # The "Title" label shows the annotation's full_name attribute.
    print(f"Title: {item.full_name}")
    print(f"Contents: {item.contents}")
    print(f"Annotation Rectangle: {item.rect}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment