Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save documentprocessing/5d63d6624aa8edfa95614bd6075e669c to your computer and use it in GitHub Desktop.
Save documentprocessing/5d63d6624aa8edfa95614bd6075e669c to your computer and use it in GitHub Desktop.
Convert PDF to HTML and PDF to XML in Python using the pdfminer.six library. Check https://products.documentprocessing.com/conversion/python/pdfminer.six/ for more details.
# Import extract_text_to_fp function from pdfminer.high_level module
from pdfminer.high_level import extract_text_to_fp
# Import BytesIO class from io module
from io import BytesIO
# Path of the PDF file you want to convert to HTML
pdf_file = 'documentprocessing.pdf'
# Create an in-memory buffer to store the HTML output as bytes
output_buffer = BytesIO()
# Convert the PDF to HTML and write the HTML to the buffer.
# The open handle gets its own name so that pdf_file keeps referring to the
# path instead of being rebound to a file object that is closed afterwards.
with open(pdf_file, 'rb') as pdf_stream:
    extract_text_to_fp(pdf_stream, output_buffer, output_type='html')
# Retrieve the HTML bytes from the buffer and decode them as UTF-8
html_content = output_buffer.getvalue().decode('utf-8')
# Specify the HTML file where you want to save the content
html_output_file = 'output.html'
# Save the HTML content to the HTML file, encoded as UTF-8 to match the decode above
with open(html_output_file, 'w', encoding='utf-8') as html_file:
    html_file.write(html_content)
# Print a message indicating where the HTML file is saved
print(f'HTML content saved to {html_output_file}')
# Import extract_text_to_fp function from pdfminer.high_level module
from pdfminer.high_level import extract_text_to_fp
# Import BytesIO class from io module
from io import BytesIO
# Open the PDF file for reading in binary mode
with open('documentprocessing.pdf', 'rb') as pdf_file:
    # Create a BytesIO object to store the XML content; the with-statement
    # closes it even if the conversion raises
    with BytesIO() as xml_output:
        # Convert the PDF to XML and write it to the BytesIO object
        extract_text_to_fp(pdf_file, xml_output, output_type='xml')
        # Take the full buffer contents; getvalue() does not depend on the
        # current stream position, so no seek back to the start is needed
        xml_content = xml_output.getvalue()
# Save the XML content in a file; binary mode writes the bytes unchanged
with open('output.xml', 'wb') as output_file:
    output_file.write(xml_content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment