Last active
October 27, 2023 02:45
-
-
Save documentprocessing/5d63d6624aa8edfa95614bd6075e669c to your computer and use it in GitHub Desktop.
Convert PDF to HTML and PDF to XML in Python using the pdfminer.six library. See https://products.documentprocessing.com/conversion/python/pdfminer.six/ for more details.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert a PDF document to HTML with pdfminer.six and save the result to disk.
from io import BytesIO

from pdfminer.high_level import extract_text_to_fp

# Path of the PDF document to convert
pdf_file = 'documentprocessing.pdf'
# Path of the HTML file that receives the converted content
html_output_file = 'output.html'
# Encoding shared by the converter and the decode step, so the two cannot drift apart
output_codec = 'utf-8'

# Convert into an in-memory buffer first; the output file is only opened
# once the conversion has finished without raising.
with BytesIO() as output_buffer:
    # The handle gets its own name so that pdf_file keeps referring to the path
    with open(pdf_file, 'rb') as pdf_stream:
        extract_text_to_fp(
            pdf_stream,
            output_buffer,
            output_type='html',
            codec=output_codec,
        )
    # Copy the bytes out before the buffer is closed at the end of the with block
    html_content = output_buffer.getvalue().decode(output_codec)

# Save the HTML content to the HTML file
with open(html_output_file, 'w', encoding='utf-8') as html_file:
    html_file.write(html_content)

# Tell the user where the HTML file was saved
print(f'HTML content saved to {html_output_file}')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert a PDF document to XML with pdfminer.six and save the result to disk.
from io import BytesIO

from pdfminer.high_level import extract_text_to_fp

# The context manager closes the in-memory buffer even if conversion raises,
# which the previous explicit close() call at the end did not guarantee.
with BytesIO() as xml_output:
    # Keep the PDF open only for the duration of the conversion itself
    with open('documentprocessing.pdf', 'rb') as pdf_file:
        extract_text_to_fp(pdf_file, xml_output, output_type='xml')
    # getvalue() returns the whole buffer regardless of the current position,
    # so no seek(0) is needed before reading it back.
    xml_content = xml_output.getvalue()

# The converter produced bytes, so write them unchanged in binary mode
with open('output.xml', 'wb') as output_file:
    output_file.write(xml_content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment