aspose-com-gists · March 20, 2025 06:41
diff --git a/parse-pdf-in-python.md b/parse-pdf-in-python.md
diff --git a/parse-pdf-in-python_extract_all_text.py b/parse-pdf-in-python_extract_all_text.py
 # Example: dump the text of every page in a PDF to a UTF-8 text file
 import aspose.pdf as ap

 # Load the source PDF
 pdf_document = ap.Document("AddText.pdf")

 # One TextAbsorber is handed to the whole page collection, so it gathers
 # the text of the entire document
 absorber = ap.text.TextAbsorber()
 pdf_document.pages.accept(absorber)

 # Read the collected text before touching the output file
 collected_text = absorber.text
 output_path = "extracted-text.txt"

 # Write the text followed by a trailing newline
 with open(output_path, "w", encoding="utf-8") as out_file:
     out_file.write(collected_text + "\n")
diff --git a/parse-pdf-in-python_extract_image.py b/parse-pdf-in-python_extract_image.py
 # Example: save the first image found on the first page of a PDF
 import aspose.pdf as ap

 # Load the source PDF
 pdf_document = ap.Document("Sample.pdf")

 # Index 1 selects the first page, and index 1 again its first image
 first_page = pdf_document.pages[1]
 image = first_page.resources.images[1]

 # Destination file for the image bytes
 target_path = "OutputImage.jpg"

 # Copy the contents of the image's stream into the destination file
 with open(target_path, "wb") as sink:
     image_stream = image.to_stream()
     sink.write(image_stream.read())
diff --git a/parse-pdf-in-python_extract_specific_page_text.py b/parse-pdf-in-python_extract_specific_page_text.py
 # Example: write the text of a single PDF page (the first one) to a UTF-8 text file
 import aspose.pdf as ap

 # Load the source PDF
 pdf_document = ap.Document("AddText.pdf")

 # Absorber that will collect the page text
 absorber = ap.text.TextAbsorber()

 # Only page 1 accepts the absorber, so only that page's text is collected
 target_page = pdf_document.pages[1]
 target_page.accept(absorber)

 # Read the collected text before touching the output file
 page_text = absorber.text
 output_path = "extracted-text.txt"

 # Write the text followed by a trailing newline
 with open(output_path, "w", encoding="utf-8") as out_file:
     out_file.write(page_text + "\n")
diff --git a/parse-pdf-in-python_extract_table.py b/parse-pdf-in-python_extract_table.py
 # This code example shows how to extract tables from a PDF document in Python
 import aspose.pdf as ap

 # Load PDF file. The package is imported as `ap`; the previous `pdf.Document`
 # call referenced a name that is never defined and raised NameError.
 document = ap.Document("sample.pdf")

 # Process all pages
 for page in document.pages:
     # Use a fresh TableAbsorber for each page so table_list only holds
     # the tables found on the current page
     absorber = ap.text.TableAbsorber()
     # Identify tables on the current page
     absorber.visit(page)
     # Loop through extracted tables
     for table in absorber.table_list:
         # Iterate through all the rows in the table
         for row in table.row_list:
             # Iterate through all the cells in the row
             for cell in row.cell_list:
                 # Print the text of every fragment contained in the cell
                 for fragment in cell.text_fragments:
                     print(fragment.text)
diff --git a/parse-pdf-in-python_extract_text_from_multi_column.py b/parse-pdf-in-python_extract_text_from_multi_column.py
 # Example: extract text from a multi-column PDF by shrinking fonts before extraction
 import io
 import aspose.pdf as ap

 # Load the source PDF
 source_document = ap.Document("multi-column-sample.pdf")

 # Collect the text fragments of every page in the document
 fragment_absorber = ap.text.TextFragmentAbsorber()
 source_document.pages.accept(fragment_absorber)
 fragments = fragment_absorber.text_fragments

 # Scale each fragment's font size to 70% of its current value
 for fragment in fragments:
     state = fragment.text_state
     state.font_size *= 0.7

 # Round-trip the modified document through an in-memory buffer
 buffer = io.BytesIO()
 source_document.save(buffer)
 buffer.seek(0)  # rewind so the reload starts at the first byte
 resized_document = ap.Document(buffer)

 # Run a plain TextAbsorber over all pages of the reloaded document
 plain_absorber = ap.text.TextAbsorber()
 resized_document.pages.accept(plain_absorber)
 column_text = plain_absorber.text

 # Write the extracted text as-is (no trailing newline is added)
 with open("ExtractColumnsText_out.txt", "w", encoding="utf-8") as out_file:
     out_file.write(column_text)
diff --git a/parse-pdf-in-python_extract_text_from_page_region.py b/parse-pdf-in-python_extract_text_from_page_region.py
 # Example: extract only the text found inside a rectangle on the first page of a PDF
 import aspose.pdf as ap

 # Load the source PDF
 pdf_document = ap.Document("sample.pdf")

 # Configure the absorber's search options: enable limit_to_page_bounds
 # and set the rectangle the search is restricted to
 region_absorber = ap.text.TextAbsorber()
 search_options = region_absorber.text_search_options
 search_options.limit_to_page_bounds = True
 search_options.rectangle = ap.Rectangle(100, 200, 250, 350, True)

 # Only page 1 accepts the absorber
 first_page = pdf_document.pages[1]
 first_page.accept(region_absorber)

 # Read the collected text before touching the output file
 region_text = region_absorber.text
 output_path = "extracted-text.txt"

 # Write the text followed by a trailing newline
 with open(output_path, "w", encoding="utf-8") as out_file:
     out_file.write(region_text + "\n")
diff --git a/parse-pdf-in-python_extract_text_with_scale_factor.py b/parse-pdf-in-python_extract_text_with_scale_factor.py
 # Example: extract text from all pages of a PDF using pure formatting mode and a scale factor
 import aspose.pdf as ap

 # Load the source PDF
 pdf_document = ap.Document("sample.pdf")

 # Absorber that will collect the text
 absorber = ap.text.TextAbsorber()

 # Build extraction options in PURE formatting mode with scale_factor 0.5
 # (the scale factor is intended to help with column detection)
 formatting_mode = ap.text.TextExtractionOptions.TextFormattingMode.PURE
 options = ap.text.TextExtractionOptions(formatting_mode)
 options.scale_factor = 0.5
 absorber.extraction_options = options

 # The whole page collection accepts the absorber, so every page is processed
 pdf_document.pages.accept(absorber)
 scaled_text = absorber.text

 # Write the extracted text as-is (no trailing newline is added)
 with open("ExtractTextUsingScaleFactor_out.txt", "w", encoding="utf-8") as out_file:
     out_file.write(scaled_text)
diff --git a/parse-pdf-in-python_get_metadata.py b/parse-pdf-in-python_get_metadata.py
 # Example: print the metadata fields stored in a PDF's document info
 import aspose.pdf as ap

 # Load the PDF and grab its info object
 pdf_document = ap.Document("Sample.pdf")
 info = pdf_document.info

 # (label, attribute) pairs; each is read and printed in this order
 # as "<label>: <value>"
 metadata_fields = (
     ("Author", "author"),
     ("Creation Date", "creation_date"),
     ("Keywords", "keywords"),
     ("Modify Date", "mod_date"),
     ("Subject", "subject"),
     ("Title", "title"),
 )
 for label, attribute in metadata_fields:
     print(f"{label}: {getattr(info, attribute)}")
diff --git a/parse-pdf-in-python_highlight_annotation.py b/parse-pdf-in-python_highlight_annotation.py
 # Example: list the highlight annotations on the first page of a PDF
 import aspose.pdf as ap

 # Load the PDF and select its first page
 pdf_document = ap.Document("annotations.pdf")
 first_page = pdf_document.pages[1]

 for item in first_page.annotations:
     # Skip every annotation that is not a highlight
     if not (item.annotation_type == ap.annotations.AnnotationType.HIGHLIGHT):
         continue
     # Report the annotation's full name and rectangle
     print(f"Title: {item.full_name}")
     print(f"Annotation Rectangle: {item.rect}")
diff --git a/parse-pdf-in-python_line_annotation.py b/parse-pdf-in-python_line_annotation.py
 # Example: list the line annotations on the first page of a PDF
 import aspose.pdf as ap

 # Load the PDF and select its first page
 pdf_document = ap.Document("annotations.pdf")
 first_page = pdf_document.pages[1]

 for item in first_page.annotations:
     # Skip every annotation that is not a line
     if not (item.annotation_type == ap.annotations.AnnotationType.LINE):
         continue
     # Report the annotation's rectangle
     print(f"Annotation Rectangle: {item.rect}")
diff --git a/parse-pdf-in-python_link_annotation.py b/parse-pdf-in-python_link_annotation.py
 # Example: list the link annotations on the first page of a PDF
 import aspose.pdf as ap

 # Load the PDF and select its first page
 pdf_document = ap.Document("annotations.pdf")
 first_page = pdf_document.pages[1]

 for item in first_page.annotations:
     # Skip every annotation that is not a link
     if not (item.annotation_type == ap.annotations.AnnotationType.LINK):
         continue
     # Report the annotation's rectangle
     print(f"Annotation Rectangle: {item.rect}")
diff --git a/parse-pdf-in-python_text_annotation.py b/parse-pdf-in-python_text_annotation.py
 # Example: list the text annotations on the first page of a PDF
 import aspose.pdf as ap

 # Load the PDF and select its first page
 pdf_document = ap.Document("annotations.pdf")
 first_page = pdf_document.pages[1]

 for item in first_page.annotations:
     # Skip every annotation that is not a text annotation
     if not (item.annotation_type == ap.annotations.AnnotationType.TEXT):
         continue
     # Report the annotation's full name, contents and rectangle
     print(f"Title: {item.full_name}")
     print(f"Contents: {item.contents}")
     print(f"Annotation Rectangle: {item.rect}")
	# This code example shows how to extract text from all pages of a PDF document in Python
	import aspose.pdf as ap

	# Open PDF document
	document = ap.Document("AddText.pdf")

	# Create text absorber
	text_absorber = ap.text.TextAbsorber()

	# Call the accept method on the page collection to process all pages
	document.pages.accept(text_absorber)

	# Retrieve the extracted text
	extracted_text = text_absorber.text

	# Define the file path
	file_path = "extracted-text.txt"

	# Open the file in write mode and write the extracted text.
	# The write call must be indented under `with`; the flattened form was a syntax error.
	with open(file_path, "w", encoding="utf-8") as tw:
	    tw.write(extracted_text + "\n")  # Write the extracted text with a newline
	# This code example shows how to extract images from a PDF in Python
	import aspose.pdf as ap

	# Open document
	document = ap.Document("Sample.pdf")

	# Extract a particular image (first image from the first page)
	x_image = document.pages[1].resources.images[1]

	# Define the output image path
	output_image_path = "OutputImage.jpg"

	# Save the extracted image.
	# The write call must be indented under `with`; the flattened form was a syntax error.
	with open(output_image_path, "wb") as output_image:
	    output_image.write(x_image.to_stream().read())
	# This code example shows how to extract tables from a PDF document in Python
	import aspose.pdf as ap

	# Load PDF file. The package is imported as `ap`; the previous `pdf.Document`
	# call referenced a name that is never defined and raised NameError.
	document = ap.Document("sample.pdf")

	# Process all pages (loop nesting restored; the flattened form was a syntax error)
	for page in document.pages:
	    # Initialize a fresh TableAbsorber object for the current page
	    absorber = ap.text.TableAbsorber()
	    # Identify tables on the current page
	    absorber.visit(page)
	    # Loop through extracted tables
	    for table in absorber.table_list:
	        # Iterate through all the rows in the table
	        for row in table.row_list:
	            # Iterate through all the cells in the row
	            for cell in row.cell_list:
	                # Fetch the text fragments
	                text_fragment_collection = cell.text_fragments
	                # Iterate through the text fragments
	                for fragment in text_fragment_collection:
	                    # Print the text
	                    print(fragment.text)
	# This code example shows how to extract text from a multi-column PDF in Python
	import io
	import aspose.pdf as ap

	# Open PDF document
	document = ap.Document("multi-column-sample.pdf")

	# Create TextFragmentAbsorber object to extract text
	text_fragment_absorber = ap.text.TextFragmentAbsorber()

	# Accept the absorber on the page collection, i.e. for all pages
	document.pages.accept(text_fragment_absorber)

	# Get the collection of extracted text fragments
	text_fragment_collection = text_fragment_absorber.text_fragments

	# Scale every fragment's font size to 70% of its current value.
	# The loop body must be indented; the flattened form was a syntax error.
	for text_fragment in text_fragment_collection:
	    text_fragment.text_state.font_size *= 0.7

	# Save the modified document to an in-memory stream
	source_stream = io.BytesIO()
	document.save(source_stream)

	# Rewind the stream and reload the document from it
	source_stream.seek(0)
	dest_document = ap.Document(source_stream)

	# Initialize TextAbsorber to extract the updated text from all pages
	text_absorber = ap.text.TextAbsorber()
	dest_document.pages.accept(text_absorber)
	extracted_text = text_absorber.text

	# Save the extracted text to a file
	with open("ExtractColumnsText_out.txt", "w", encoding="utf-8") as file:
	    file.write(extracted_text)
	# Example: print the metadata fields stored in a PDF's document info
	import aspose.pdf as ap

	# Load the PDF and grab its info object
	pdf_document = ap.Document("Sample.pdf")
	info = pdf_document.info

	# (label, attribute) pairs; each is read and printed in this order
	# as "<label>: <value>"
	metadata_fields = (
	    ("Author", "author"),
	    ("Creation Date", "creation_date"),
	    ("Keywords", "keywords"),
	    ("Modify Date", "mod_date"),
	    ("Subject", "subject"),
	    ("Title", "title"),
	)
	for label, attribute in metadata_fields:
	    print(f"{label}: {getattr(info, attribute)}")
	# This code example shows how to list the highlight annotations on the first page of a PDF
	import aspose.pdf as ap

	# Load the PDF document
	document = ap.Document("annotations.pdf")

	# Loop through all annotations on the first page
	# (loop and condition nesting restored; the flattened form was a syntax error)
	for annotation in document.pages[1].annotations:
	    # Only report highlight annotations
	    if annotation.annotation_type == ap.annotations.AnnotationType.HIGHLIGHT:
	        # Print annotation details
	        print(f"Title: {annotation.full_name}")
	        print(f"Annotation Rectangle: {annotation.rect}")