Created
May 16, 2014 09:53
-
-
Save StevenMaude/88def892b0cbfa8ae818 to your computer and use it in GitHub Desktop.
pdf_to_html_preview messy hack to work on Windows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#forked from: Julian_Todd / PDF to HTML (https://scraperwiki.com/views/pdf-to-html-preview-1/) | |
#input url goes to line | |
import sys | |
import urllib, urllib2, urlparse | |
import lxml.etree, lxml.html | |
import re, os | |
def Pageblock(page, index):
    '''
    Render one <page> element of the pdftohtml XML as an HTML block:
    a summary paragraph followed by one absolutely-positioned div per
    text run, sized and placed from the element's pixel attributes.
    Returns the generated HTML as a single newline-joined string.
    '''
    chunks = []
    assert page.tag == 'page'
    page_height = int(page.attrib.get('height'))
    page_width = int(page.attrib.get('width'))
    number = page.attrib.get('number')
    assert page.attrib.get('position') == "absolute"
    chunks.append('<p>Page %s index %d height=%d width=%d</p>' % (number, index, page_height, page_width))
    chunks.append('<div class="page" style="height:%dpx; width:%dpx">' % (page_height, page_width))
    for node in page:
        # Font declarations carry no positioned content; anything else
        # in a page is expected to be a text run.
        if node.tag == 'fontspec':
            continue
        assert node.tag == 'text'
        # Slice the serialized element rather than using node.text so any
        # inline child markup (e.g. <b>, <i>) inside the run is preserved.
        text = re.match('(?s)<text.*?>(.*?)</text>', lxml.etree.tostring(node)).group(1)
        box = 'top:%dpx; left:%dpx; height:%dpx; width:%dpx' % (
            int(node.attrib.get('top')),
            int(node.attrib.get('left')),
            int(node.attrib.get('height')),
            int(node.attrib.get('width')),
        )
        chunks.append(' <div class="text fontspec-%s" style="%s">%s</div>' % (node.attrib.get('font'), box, text))
    chunks.append('</div>')
    return '\n'.join(chunks)
# pdftoxml based on utils.py in scraperwiki-python | |
# David Jones, ScraperWiki Limited | |
# Thomas Levine, ScraperWiki Limited | |
# BSD Licence; see https://github.com/scraperwiki/scraperwiki-python | |
def pdftoxml(pdfdata, options):
    """Convert raw PDF bytes to pdftohtml's XML representation.

    Writes pdfdata to input.pdf in the current directory, shells out to
    pdftohtml (which must be on the PATH or alongside this script), and
    returns the text of the output.xml it produces.

    pdfdata -- the PDF document as a byte string.
    options -- extra pdftohtml switches, e.g. '-hidden ' (must end with a
               space, as it is concatenated straight into the command line).

    Raises IOError if pdftohtml produced no output.
    """
    # lots of hacky Windows fixes c.f. original
    with open('input.pdf', 'wb') as f:
        f.write(pdfdata)
    # os.system's exit status is discarded below, so delete any stale
    # output.xml from a previous run; otherwise a failed conversion would
    # silently return the previous document's XML.
    if os.path.exists('output.xml'):
        os.remove('output.xml')
    cmd = 'pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes '
    if options:
        cmd += options
    cmd += 'input.pdf output.xml'
    # pdftohtml can't be told to be quiet, so throw away stdout and stderr;
    # NUL is the Windows null device.
    cmd = cmd + " > NUL 2>&1"
    os.system(cmd)
    if not os.path.exists('output.xml'):
        raise IOError('pdftohtml produced no output.xml (command was: %s)' % cmd)
    with open('output.xml', 'r') as f:
        return f.read()
def Main(pdfurl, hidden): | |
''' | |
Take the URL of a PDF, and use scraperwiki.pdftoxml and lxml to output the contents | |
as a styled HTML div. | |
''' | |
pdfdata = urllib2.urlopen(pdfurl).read() | |
options = '' | |
if hidden == 'hidden': | |
options='-hidden ' # | |
# TODO: readd this if implemented in scraperwiki-python | |
# see https://github.com/scraperwiki/scraperwiki-python/issues/48 | |
# pdfxml = scraperwiki.pdftoxml(pdfdata, options) | |
pdfxml = pdftoxml(pdfdata, options) | |
try: | |
root = lxml.etree.fromstring(pdfxml) | |
except lxml.etree.XMLSyntaxError, e: | |
print str(e), str(type(e)).replace("<", "<") | |
print pdfurl | |
print pdfxml.replace("<", "<") | |
root = [] | |
global styles | |
fontspecs = { } | |
# Get the PDF's internal styles: we'll use these to style the divs containing the PDF. | |
for fontspec in (root is not None and root.xpath('page/fontspec')): | |
id = fontspec.attrib.get('id') | |
fontdesc = {'size':int(fontspec.attrib.get('size')), 'family':fontspec.attrib.get('family'), 'color':fontspec.attrib.get('color')} | |
fontspecs[id] = fontdesc | |
styles['div.fontspec-%s' % id] = 'color:%s;font-family:%s;font-size:%dpx' % (fontdesc['color'], fontdesc['family'], fontdesc['size']) | |
# Output the view, with instructions for the user. | |
print '<html dir="ltr" lang="en">' | |
print '<head>' | |
print ' <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>' | |
print ' <title>PDF to XML text positioning</title>' | |
print ' <style type="text/css" media="screen">%s</style>' % "\n".join([ "%s { %s }" % (k, v) for k, v in styles.items() ]) | |
print ' <script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>' | |
print ' <script>%s</script>' % jscript | |
print '</head>' | |
print '<div class="info" id="info1"><text block></div>' | |
print '<div class="info" id="info2"><position></div>' | |
print '<div class="heading">' | |
print '<h2>Graphical preview of scraperwiki.pdftoxml(pdfdata)</h2>' | |
print '<p>Click on a text line to see its coordinates and any other text that shares the same column or row.' | |
print ' Useful for discovering what coordinates to use when extracting rows from tables in a document.</p>' | |
print '<p>To do: track the coordinates of the mouse and cross reference with <a href="/cropper">cropper</a> technology.</p>' | |
print '<p class="href"><a href="%s">%s</a></p>'% (pdfurl, pdfurl) | |
print '<form id="newpdfdoclink">' | |
print '<label for="url">PDF link</label>' | |
print ' <input type="text" name="url" id="url" value="%s" title="paste in url of document">' % pdfurl | |
if hidden == 1: | |
checked="checked " | |
else: | |
checked="" | |
print '<br /><label for="hidden">Force hidden text extraction</label>' | |
print ' <input type="checkbox" name="hidden" id="hidden" value="1" %stitle="force hidden text extraction">' % checked | |
print '<br /> <input type="submit" value="Go">' | |
print '</form>' | |
ttx = re.sub('<', '<', pdfxml) | |
ttx = re.sub('\n', '\r\n', ttx) | |
# Does this truncate in case of large PDF? | |
print '<textarea class="pdfprev">%s</textarea>' % ttx[:5000] | |
print '</div>' | |
print '<p>There are %d pages</p>' % len(root) | |
# Print each page of the PDF. | |
for index, page in enumerate(root): | |
print Pageblock(page, index) | |
# Global CSS rules for the generated page, keyed by selector.
# Main() appends one div.fontspec-<id> rule per font found in the PDF.
styles = {}
# Fixed info boxes updated by the click handler in jscript.
styles["div#info1"] = "position:fixed; white-space:pre; background-color:#ffd; border: thin red solid; z-index: 50; top:0px;"
styles["div#info2"] = "position:fixed; white-space:pre; background-color:#ffd; border: thin red solid; z-index: 50; top:20px;"
# Page header and source link.
styles["div.heading"] = "padding-left:150px;"
styles["p.href"] = "font-size:60%"
# One bordered container per PDF page; text runs positioned absolutely inside.
styles["div.page"] = "background-color:#fff; border:thin black solid; position:relative; margin:2em;"
styles["div.text"] = "position:absolute; white-space:pre; background-color:#eee;"
styles["div.text:hover"] = "background-color:#faa; cursor:pointer"
# Raw-XML preview box.
styles["textarea.pdfprev"] = "white-space:pre; height:150px; width:80%"
# Highlights for text runs sharing the clicked run's column / row.
styles["div.linev"] = "background-color:#fcc;"
styles["div.lineh"] = "background-color:#fce;"
# Global JavaScript allowing the user to click on an area of the PDF div, and
# see the underlying PDF source.  On click it copies the clicked block's HTML
# into div#info1 and its pixel coordinates (top/bottom/left/right plus the
# fontspec id pulled out of the class attribute) into div#info2, then adds
# class 'linev' to every text div sharing the same left edge and 'lineh' to
# every one sharing the same top edge, removing the classes elsewhere.
# NOTE: '\\\\w' in the Python source becomes '\\w' in the emitted JS, i.e. the
# regex \w+ — don't "simplify" the escaping.
jscript = """
var rfontspec = new RegExp('fontspec-(\\\\w+)');
$(function()
{
    $('div.text').click(function ()
    {
        var top = parseInt($(this).css('top'));
        var left = parseInt($(this).css('left'));
        var width = parseInt($(this).css('width'));
        var height = parseInt($(this).css('height'));
        var clas = $(this).attr('class');
        var lfont = rfontspec.exec(clas);
        var font = (lfont ? lfont[1] : clas);
        $('div#info1').text($(this).html());
        $('div#info2').text('top='+top + ' bottom='+(top+height)+ ' left='+left + ' right='+(left+width) + ' font='+font);
        $('div.text').each(function()
        {
            var lleft = parseInt($(this).css('left'));
            if (lleft == left)
                $(this).addClass('linev');
            else
                $(this).removeClass('linev');
            var ltop = parseInt($(this).css('top'));
            if (ltop == top)
                $(this).addClass('lineh');
            else
                $(this).removeClass('lineh');
        });
    });
});
"""
# Check for a PDF URL and hidden-checkbox entered by the user: if none, use our default values: | |
# urlquery = os.getenv('URLQUERY') | |
#if urlquery: | |
# querydata = urlparse.parse_qsl(urlquery); | |
# for pair in querydata: | |
# if pair[0] == "url": | |
# pdfurl = urllib.unquote(pair[1]) | |
# if pair[0] == "hidden": | |
# hidden = 1 | |
if __name__ == '__main__': | |
try: | |
pdfurl = sys.argv[1] | |
except IndexError: | |
print "Enter a valid URL. For example:" | |
print "http://soswy.state.wy.us/Elections/Docs/2004/04Results/04General/AL_PbP_Candidate_Summary.pdf" | |
sys.exit(1) | |
try: | |
hidden = sys.argv[2] | |
except IndexError: | |
hidden = None | |
Main(pdfurl, hidden) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Windows installation
Python
Install the latest Python 2 (this is 2.7.6 at the moment). This was all tested with the x86 release, so I'd recommend that (not x86-64).
Make a directory to work in, e.g.
pdf_to_html
Open command prompt in this work folder (quick way, in Explorer, right click the folder while holding shift, Open Command Prompt).
Try running Python by entering `python` into the command prompt. If you get `'python' is not recognized...`, execute this command: `set PATH=%PATH%;"C:\Python27"` (that's the default location for Python on Windows; otherwise replace the directory with wherever your Python lives). If you need this change, it only persists in this command prompt; there are plenty of guides on how to set it permanently using Windows Control Panel.
At this stage, you should be able to execute the command `python` at the prompt, and you'll be in a Python interpreter. Exit this by entering `exit()` to return to the command prompt.

lxml
Download lxml from https://pypi.python.org/pypi/lxml/3.3.5 — get `lxml-3.3.5.win32-py2.7.exe`. You can install lxml with that installer. Alternatively, you can just open the archive up with e.g. 7-Zip (it's an executable zip file) and extract the `lxml` directory to your `pdf_to_html` work folder.
Start the Python interpreter again in the same command prompt with `python`; enter `import lxml`. If all is well, you should see nothing: it should execute silently (without any `ImportError`). Great! Exit back to the command prompt again with `exit()`.

pdftohtml.exe
We need a recent version of the pdftohtml program that does the conversion from PDF to XML. An easy way to get this on Windows is via Calibre Portable. Install it anywhere; you just need a few files from it. From the folder containing `calibre.exe`, copy `pdftohtml.exe` into your `pdf_to_html` work folder. Next, go into the DLLs folder in the Calibre install and copy `freetype.dll`, `jpeg.dll`, `libpng12.dll` and `zlib1.dll` to your `pdf_to_html` directory.

This pdf_to_html.py script
Download `pdf_to_html.py` from this gist and save it as `pdf_to_html.py`; this version is hacked to work on Windows without needing the `scraperwiki` package. Move it to your `pdf_to_html` work folder.

Usage
Right, after all that hard work, you should have `pdf_to_html.py`, `pdftohtml.exe` and the DLLs from Calibre, along with the `lxml` folder, all in your `pdf_to_html` folder.
Execute, in the command prompt you've been using this whole time:
`python pdf_to_html.py http://www.somepdf.url > output.html`
If you want to use the hidden text option:
`python pdf_to_html.py http://www.somepdf.url hidden > output.html`
View `output.html` in a browser to see the output.