Skip to content

Instantly share code, notes, and snippets.

@ivandeex
Last active September 9, 2022 12:22
Show Gist options
  • Save ivandeex/f903347745688da9bd5d to your computer and use it in GitHub Desktop.
Save ivandeex/f903347745688da9bd5d to your computer and use it in GitHub Desktop.
Convert PDF to HTML, plain text, or XML
# https://github.com/scraperwiki/pdfminer/blob/scraperwiki/tools/pdf2html.cgi
import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams
# Convert selected pages of a PDF to HTML or plain text with pdfminer.
#
# `format`, `out_file` and `in_file` are expected to be defined by the
# surrounding code (an output format string, a writable file object and a
# readable PDF file object respectively -- TODO confirm against the caller).
# NOTE(review): if `format` is never assigned, the name resolves to the
# builtin function, the comparison is False and TextConverter is always used.
rsrcmgr = PDFResourceManager()
laparams = LAParams()
converter = HTMLConverter if format == 'html' else TextConverter
device = converter(rsrcmgr, out_file, codec='utf-8', laparams=laparams)
try:
    # NOTE(review): pdfminer appears to treat `pagenos` as zero-based page
    # indices -- confirm that [1, 3, 5] selects the intended pages.
    process_pdf(rsrcmgr, device, in_file, pagenos=[1, 3, 5], maxpages=9)
finally:
    # Always close the device so the converter can finish its output; the
    # HTML converter emits its closing markup at this point.
    device.close()
# https://github.com/scraperwiki/scraperwiki-python/blob/master/scraperwiki/utils.py
# Convert the PDF at `pdf_filename` to XML with the external `pdftohtml`
# tool and load the produced document into `result` as text.
#
# `pdf_filename` is expected to be defined by the surrounding code.
import contextlib
import os
import subprocess
import tempfile

# Open the temporary file in binary mode so that read() returns bytes and
# the explicit UTF-8 decode below works on both Python 2 and Python 3.
with contextlib.closing(
        tempfile.NamedTemporaryFile(mode='rb', suffix='.xml')) as xmlin:
    # pdftohtml appends ".xml" to the output base name itself, so strip the
    # suffix from the temporary file name to make it write into that file.
    output_base = os.path.splitext(xmlin.name)[0]
    cmd = [
        'pdftohtml', '-xml', '-nodrm', '-zoom', '1.5',
        '-enc', 'UTF-8', '-noframes',
        pdf_filename, output_base,
    ]
    # Pass the arguments as a list (no shell) so file names containing
    # quotes or shell metacharacters cannot break or inject into the command.
    # The tool's chatter cannot be switched off, so discard stdout/stderr.
    # The exit status is ignored, as before; unlike os.system, a missing
    # pdftohtml binary now raises OSError instead of failing silently.
    with open(os.devnull, 'wb') as devnull:
        subprocess.call(cmd, stdout=devnull, stderr=devnull)
    result = xmlin.read().decode('utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment