Last active
August 29, 2015 14:03
-
-
Save vitorio/8dc8853ee01ec651f263 to your computer and use it in GitHub Desktop.
Turn a Distance ePub file into something more web-appropriate: replace the XHTML doctype with HTML5; inline zeitgeist.css, template.css, and some CSS from your.distance.cc to move the paragraph numbers; add IDs; protect the email address with JS; and resolve and embiggen the dsn.tc short URLs. Deeded to the public domain. To the extent possible …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
__author__ = 'vitorio' | |
import argparse
import json
import re

import bs4
import requests
# Command line: one positional argument, the path of the Distance document to
# convert.
parser = argparse.ArgumentParser(description='Turn a Distance ePub file into something more web-appropriate')
parser.add_argument('epubfile', help='The Distance ePub file to read from')
args = parser.parse_args()

# Read the input inside a context manager so the handle is closed as soon as
# Beautiful Soup has consumed it, and open it in binary mode so Beautiful Soup
# detects the document encoding itself rather than depending on the platform's
# default text encoding.
# NOTE(review): no parser is named, so bs4 keeps choosing whichever parser it
# considers best among those installed -- confirm which one the output should
# be pinned to before naming it here.
with open(args.epubfile, 'rb') as epubhandle:
    soup = bs4.BeautifulSoup(epubhandle)
# Replace XML doctype: find the first top-level Doctype node (if any) and swap
# it for the short HTML5 doctype.
olddoctype = next(
    (node for node in soup.contents if isinstance(node, bs4.element.Doctype)),
    None)
if olddoctype is not None:
    olddoctype.replace_with(bs4.Doctype('html'))
# Update HTML tag: declare the document language and drop the xmlns attribute
# carried over from the XHTML source.
htmltag = soup.html
htmltag['lang'] = 'en'
del htmltag['xmlns']
# Add UTF-8 meta tag: <head> ends up starting with a newline followed by
# <meta charset="utf-8">, ahead of whatever it already contained.
headtag = soup.head
newmeta = soup.new_tag('meta', charset='utf-8')
headtag.insert(0, newmeta)
headtag.insert(0, '\n')
# Replace template.css so file can stand alone, add additional styles.
# Only the first <link> element in the document is removed; the stylesheets
# are inlined as <style> elements further down.
stylesheetlink = soup.find('link')
stylesheetlink.decompose()
# Minified zeitgeist.css, inlined verbatim into a <style> element so the output
# file needs no external stylesheet.  The doubled backslash keeps the CSS
# escape \201C (the blockquote's opening quote mark) intact in the Python
# string.
zeitgeistcss = '''
/* zeitgeist.css */
article,aside,details,figcaption,figure,footer,header,hgroup,nav,section,summary{display:block;}audio,canvas,video{display:inline-block;*display:inline;*zoom:1;}audio:not([controls]){display:none;}[hidden]{display:none;}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;}html,button,input,select,textarea{font-family:sans-serif;color:#222;}body{width:48em;margin:0 auto;font-size:1em;line-height:1.4;}::-moz-selection{background:#b3d4fc;text-shadow:none;}::selection{background:#b3d4fc;text-shadow:none;}a{color:#00e;}a:visited{color:#551a8b;}a:hover{color:#06e;}a:focus{outline:thin dotted;}a:hover,a:active{outline:0;}body{font:1em/1.625em "Lucida Grande","Lucida Sans Unicode",sans-serif;font-size-adjust:none;font-style:normal;font-variant:normal;font-weight:normal;background-color:#FFFEF0;}h1,h2,h3,h4,h5,h6{font-weight:normal;color:#333;font-family:Georgia,serif;}h1{font-size:2.125em;margin-bottom:.765em;}h2{font-size:1.9em;margin-bottom:.855em;}h3{font-size:1.7em;margin-bottom:.956em;}h4{font-size:1.4em;margin-bottom:1.161em;}h5,h6{font-size:1.313em;margin-bottom:1.238em;}a{color:#005AF2;text-decoration:none;}a:hover{text-decoration:underline;}abbr,acronym{border-bottom:1px dotted #000;}address{margin-top:1.625em;font-style:italic;}b,strong{font-weight:bold;}blockquote{padding:1em 1em 1.625em 1em;font-family:Georgia,serif;font-style:italic;}blockquote:before{content:"\\201C";font-size:3em;margin-left:-0.625em;font-family:Georgia,serif;color:#aaa;line-height:0;}blockquote>p{padding:0;margin:0;}caption{text-align:center;font-family:Georgia,serif;}del{color:#000;}dfn,em{font-style:italic;}dfn{font-weight:bold;}dl{margin:0 0 1.625em 0;}dl dt{font-weight:bold;}dl dd{margin-left:1.625em;}hr{display:block;height:1px;border:0;border-top:1px solid #ccc;margin-bottom:1.625em;padding:0;}ins{background:#ff9;color:#000;text-decoration:none;}mark{background:#ff0;color:#000;font-style:italic;font-weight:bold;}p{padding:0 0 .8125em 
0;color:#111;font-weight:300;}p+p{text-indent:1.625em;}p.first:first-letter{float:left;font-family:Baskerville,"Palatino Linotype",serif;font-size:3em;font-weight:700;line-height:1em;margin-bottom:-0.2em;padding:.2em .1em 0 0;}p img{float:left;margin:.5em .8125em .8125em 0;padding:0;}p img.right{float:right;margin:.5em 0 .8125em .8125em;}pre,code,kbd,samp,tt{font-family:Monaco,"Lucida Mono","Liberation Mono","Courier New","Courier",monospace;_font-family:'courier new',monospace;font-size:1em;background:#eee;line-height:1.5;}pre,code{white-space:pre;white-space:pre-wrap;word-wrap:break-word;margin:1.625em 0;}q{quotes:none;}q:before,q:after{content:"";content:none;}small{font-size:85%;}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline;}sup{top:-0.5em;}sub{bottom:-0.25em;}tt{display:block;margin:1.625em 0;}ul,ol{list-style-position:outside;margin:0 0 1.625em 0;padding:0 0 0 40px;}li ul,li ol{margin:0 1.625em;}nav ul,nav ol{list-style:none;list-style-image:none;margin:0;padding:0;}img{border:0;vertical-align:middle;-ms-interpolation-mode:bicubic;}svg:not(:root){overflow:hidden;}figure{margin:0;}form{margin:0;}fieldset{border:0;margin:0;padding:0;}label{cursor:pointer;}legend{border:0;padding:0;white-space:normal;*margin-left:-7px;}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;}button,input{line-height:normal;}button,input[type="button"],input[type="reset"],input[type="submit"]{cursor:pointer;-webkit-appearance:button;*overflow:visible;}button[disabled],input[disabled]{cursor:default;}input[type="checkbox"],input[type="radio"]{box-sizing:border-box;padding:0;*width:13px;*height:13px;}input[type="search"]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box;}input[type="search"]::-webkit-search-decoration,input[type="search"]::-webkit-search-cancel-button{-webkit-appearance:none;}button::-moz-focus-inner,input::-moz-focus-inner{border
:0;padding:0;}textarea{overflow:auto;vertical-align:top;resize:vertical;}table{border-collapse:collapse;border-spacing:0;margin-bottom:1.625em;}th{font-weight:bold;}tr,th,td{margin:0;padding:0 1.625em 0 1em;height:26px;}td{vertical-align:top;}tfoot{font-style:italic;}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 0;}@media print{*{background:transparent!important;color:#000!important;box-shadow:none!important;text-shadow:none!important;}a,a:visited{text-decoration:underline;}a[href]:after{content:"(" attr(href) ")";}abbr[title]:after{content:"(" attr(title) ")";}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:"";}pre,blockquote{border:1px solid #999;page-break-inside:avoid;}thead{display:table-header-group;}tr,img{page-break-inside:avoid;}img{max-width:100%!important;}@page{margin:.5cm;}p,h2,h3{orphans:3;widows:3;}h2,h3{page-break-after:avoid;}}
'''
# Rules labelled template.css, inlined into their own <style> element so the
# output does not depend on the stylesheet <link> the script removes.
templatecss = '''
/* template.css */
@page {
margin: 0.5em;
}
tbody, thead, tfoot, tr, td, th {
border-style: inherit;
border-width: inherit;
border-color: inherit;
}
.leftFloat {
float: left;
}
.rightFloat {
float: right;
}
.page-break {
page-break-before: always;
}
.pgh_no {
font-size: 0.5em;
}
.attribution {
text-align: right;
font-style: italic;
}
h1, h2, h3.byline {
text-align: center;
}
'''
# Extra rule labelled as coming from your.distance.cc: floats the .pgh_no
# paragraph-number spans to the right with a negative right margin.  It is
# emitted after template.css, which also styles .pgh_no.
paragraphcss = '''
/* your.distance.cc */
.pgh_no {
display: inline;
float: right;
font-size: 0.5em;
margin-top: 1em;
text-decoration: none;
color: #613418;
/* margin-right: -2.25em; */
margin-right: -3em;
}
'''
# Append each stylesheet to <head> as its own <style> element, in cascade
# order, with a blank line after each one.
for stylesheet in (zeitgeistcss, templatecss, paragraphcss):
    newcss = soup.new_tag('style')
    newcss.string = stylesheet
    headtag.append(newcss)
    headtag.append('\n\n')
# Rewrite the paragraph spans: a span whose text is "[N]" becomes a pilcrow
# followed by N, and the element containing the span gets the id "pN".
for numberspan in soup.find_all('span', attrs={'class': 'pgh_no'}):
    number = int(numberspan.string.strip('[]'))
    numberspan.parent['id'] = 'p%d' % number
    numberspan.string = u'¶ %d' % number
# Rewrite the header tags: number every heading h1..h6 in document order,
# giving them the ids "h1", "h2", ... (the id counts headings, it is not the
# heading level).
headingnames = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
for position, heading in enumerate(soup.find_all(headingnames), start=1):
    heading['id'] = 'h%d' % position
# Replace the byline email address with JavaScript: the address is removed
# from the markup and reassembled at page load by an inline script, so it
# never appears in the HTML in one piece.


def _js_string(text):
    """Return text as a JavaScript string literal safe inside a <script> element."""
    # json.dumps handles quotes, backslashes and non-ASCII characters; the
    # replace stops a literal "</script>" from closing the element early.
    return json.dumps(text).replace('</', '<\\/')


emaila = soup.find('h3', attrs={'class': 'byline'}).find('a', href=re.compile('mailto:'))
# Split on the first '@' only, so nothing after a second '@' is silently lost.
emailstr = emaila.string.split('@', 1)
# The two halves are emitted as escaped JS literals rather than pasted between
# hand-written quotes, and the link text is set with textContent so the
# address is never parsed as HTML.
emailjs = '''
var DISTANCE = DISTANCE || {};
DISTANCE.email = %s;
DISTANCE.email += '@';
DISTANCE.email += %s;
DISTANCE.bylineemail = document.getElementById('bylineemail');
DISTANCE.bylineemail.setAttribute('href', 'mailto:' + DISTANCE.email);
DISTANCE.bylineemail.textContent = DISTANCE.email;
''' % (_js_string(emailstr[0]), _js_string(emailstr[1]))
# Leave an empty, id-tagged anchor for the script to fill in.
emaila['id'] = 'bylineemail'
del emaila['href']
emaila.string = ''
# The script goes at the end of <body>, after the anchor it looks up.
newjs = soup.new_tag('script')
newjs.string = emailjs
soup.find('body').append(newjs)
soup.find('body').append('\n')
# Resolve and embiggen the dsn.tc short URLs: each short link is requested
# without following redirects, and the redirect target becomes both the link's
# href and its visible text.  The short URL is kept in data-dsntc.
# The pattern is a raw string so that "\." is a regex escape rather than an
# invalid Python string escape.
for dsntc in soup.find_all('a', href=re.compile(r'http://dsn\.tc/')):
    # A timeout keeps one unresponsive lookup from hanging the whole run.
    # verify=False was dropped: the matched URLs are plain http and redirects
    # are not followed, so no TLS connection is ever made here.
    r = requests.get(dsntc['href'], allow_redirects=False, timeout=30)
    longurl = r.headers.get('location')
    if not longurl:
        # The shortener did not answer with a redirect; keep the original
        # short link rather than aborting the conversion.
        continue
    dsntc['data-dsntc'] = dsntc['href']
    dsntc.string = longurl
    dsntc['href'] = longurl
# Write the converted document next to the input, as "<input>.py.html".
# The handle is binary, so the soup is serialised to UTF-8 bytes explicitly
# (matching the <meta charset="utf-8"> added earlier); writing str(soup) to a
# 'wb' handle raises TypeError on Python 3.
with open(args.epubfile + '.py.html', 'wb') as outfile:
    outfile.write(soup.encode('utf-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment