@esehara
Created July 23, 2011 05:29
Aozora Author to Epub
# -*- coding:utf-8 -*-
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import os
import zipfile
"""
使い方:
青空文庫の著者カードを下のURLに記述してください。
そのあとにスクリプトを起動すると、その作者で公開されている作品のXHTMLファイルを
全てダウンロードして、epubにまとめてくれます。
注意:
デフォルトの作業デフェクトリは "./temp/"になっています。
もしスクリプトが置いてあるフォルダに、
フォルダが存在しない場合は、"./temp/"を作成してください。
"""
author_url = 'http://www.aozora.gr.jp/index_pages/person281.html'
class Aozora_to_epub(object):
    """Driver: fetch an author's works and build a single EPUB from them."""
    def __init__(self, url):
        self.url = url
        self.epub = Epub_Make()
        self.author = Aozora_author_url()

    def get_content(self):
        self.author.get_url(self.url)
        self.epub.create_init(self.author.analyzer(self.epub.workspace))
        self.author.get_html()

    def make_epub(self):
        self.epub.make_opf(self.url)
        self.epub.make_ncx(self.url)
        self.epub.make_zip()
class Aozora_author_url(object):
    """Scrape an author card page for the list of work card URLs."""
    def __init__(self):
        self.opener = urllib2.build_opener()

    def get_url(self, url):
        html = self.opener.open(url).read()
        self.mainsoup = BeautifulSoup(html)

    def analyzer(self, workspace):
        # The author card lists its works in the first <ol> element;
        # the author's name sits in the first <font> element.
        self.worklist = []
        subsoup = self.mainsoup.find('ol')
        self.author = self.mainsoup.find('font').text
        for item in subsoup.findAll('a'):
            print item.text
            #print item['href']
            self.worklist.append({'title': item.text,
                                  'url': 'http://www.aozora.gr.jp/index_pages/' + item['href']})
        self.works_get = Aozora_works_get(workspace + self.author)
        return self.author

    def get_html(self):
        for item in self.worklist:
            self.works_get.get_url(item['url'])
            self.works_get.analyzer()
            self.works_get.download()
class Aozora_works_get(object):
    """Fetch a single work card, locate its XHTML file and download it."""
    def __init__(self, dir='./temp/'):
        self.opener = urllib2.build_opener()
        self.re_html = re.compile('.*html')
        self.save_dir = dir
        self.encode = System_charset()

    def get_url(self, url):
        html = self.opener.open(url).read()
        self.mainsoup = BeautifulSoup(html)
        # Remember the directory part of the card URL; the download links
        # on the card are relative to it.
        url = url.split('/')
        url.pop()
        self.base_url = '/'.join(url)

    def analyzer(self):
        try:
            # The sixth <table> on a work card holds the download links;
            # the first <font> holds the work's title, used as the file name.
            subsoup = self.mainsoup.findAll('table')[5]
            linksoup = subsoup.findAll('a')
            self.author_data = self.mainsoup.find('font').text
            for itemsoup in linksoup:
                if self.re_html.match(itemsoup['href']) is not None:
                    #print u"[DEBUG]Download XHTML => " + itemsoup['href']
                    self.download_que = itemsoup['href']
                    return
        except:
            print '[DEBUG] Sorry, there is a problem with this file...'
        self.download_que = False

    def download(self):
        if self.download_que:
            #print u"[DEBUG]self.base_url => " + self.base_url
            #print u"[DEBUG]self.download_que =>" + self.download_que
            print u"[DEBUG]get url => " + self.base_url + '/' + self.download_que
            urllib.urlretrieve(self.base_url + '/' + self.download_que,
                               self.save_dir + u'/text/' + self.author_data + u".xhtml")
            self.encode.work(self.save_dir + '/text/' + self.author_data + ".xhtml")
class System_charset(object):
    """
    Guess the character set of a downloaded file and re-encode it as UTF-8.
    Reference: http://www.aozora.gr.jp/index_pages/person281.html
    """
    def guess_charset(self, data):
        # Try the encodings Aozora Bunko files commonly use, in order.
        f = lambda d, enc: d.decode(enc) and enc
        try: return f(data, 'utf-8')
        except: pass
        try: return f(data, 'shift-jis')
        except: pass
        try: return f(data, 'euc-jp')
        except: pass
        try: return f(data, 'iso2022-jp')
        except: pass
        return None

    def conv(self, data):
        charset = self.guess_charset(data)
        u = data.decode(charset)
        return u.encode('utf-8')

    def work(self, data):
        print "[DEBUG] Convert data UTF-8 Start :" + data
        convert_data = file(data, 'rU')
        work_data = convert_data.read()
        # Rewrite the charset declaration inside the XHTML as well.
        work_data = re.sub('Shift_JIS', 'UTF-8', work_data)
        convert_data.close()
        try:
            work_data = self.conv(work_data)
        except:
            print "[DEBUG]", data, "-> skip"
        convert_data = file(data, 'w')
        convert_data.write(work_data)
        convert_data.close()
        print "[DEBUG]", data, '-> Convert UTF-8'
class Epub_Make(object):
    """Build the EPUB directory layout, metadata files and final archive."""
    def __init__(self, workspace='./temp/'):
        self.workspace = workspace
        self.author = 'hogehoge'

    def create_init(self, author):
        # Lay out the minimal EPUB skeleton: mimetype, META-INF/container.xml
        # and a text/ directory for the downloaded works.
        workdir = self.workspace + author + '/'
        self.author = author
        os.mkdir(workdir)
        os.mkdir(workdir + 'text')
        os.mkdir(workdir + 'META-INF')
        # The mimetype file must contain exactly this string, no trailing newline.
        mimetype = open(workdir + 'mimetype', 'w')
        mimetype.write('application/epub+zip')
        mimetype.close()
        container_xml = open(workdir + 'META-INF/container.xml', 'w')
        make_xml = """<?xml version='1.0' encoding="UTF-8"?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile full-path="content.opf" media-type="application/oebps-package+xml" />
</rootfiles>
</container>
"""
        container_xml.write(make_xml)
        container_xml.close()
        self.workdir = workdir
    def make_opf(self, url):
        # The OPF package document: metadata, manifest and spine.
        xml_header = u"""<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>%s 全部</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:language>ja</dc:language>
<dc:publisher>Aozora Bunko to Epub (青空文庫) </dc:publisher>
<dc:identifier id="BookId">%s</dc:identifier>
</metadata>
""" % (self.author, self.author, url)
        xml_manifest_header = u"""
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
"""
        xml_manifest_body = u""
        xml_spine_body = u""
        # The downloaded files are renamed to 1.xhtml, 2.xhtml, ... in
        # make_ncx(), so the manifest refers to the numbered names here.
        filelist = os.listdir(self.workdir + '/text/')
        for number, file in enumerate(filelist):
            xml_manifest_body += u"<item id='file%d' href='text/%s' media-type='application/xhtml+xml' /> " % (number + 1, str(number + 1) + '.xhtml')
            xml_spine_body += u"<itemref idref='file%d' />\n" % (number + 1)
        xml_manifest_body += u'</manifest>'
        # List every work in the spine so the whole book is in reading order.
        xml_footer = u"""
<spine toc="ncx">
%s</spine>
</package>
""" % xml_spine_body
        print "[DEBUG] ---- Output OPF File ----"
        opffile = open(self.workdir + 'content.opf', 'w')
        print xml_header
        opffile.write(xml_header.encode('utf-8'))
        print xml_manifest_header
        opffile.write(xml_manifest_header.encode('utf-8'))
        print xml_manifest_body
        opffile.write(xml_manifest_body.encode('utf-8'))
        print xml_footer
        opffile.write(xml_footer.encode('utf-8'))
        opffile.close()
        self.filelist = filelist
    def make_ncx(self, url):
        # The NCX table of contents: one navPoint per downloaded work.
        xml_header = u"""<?xml version="1.0" encoding="UTF-8" ?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name='dtb:uid' content="%s" />
<meta name='dtb:depth' content="1" />
<meta name='dtb:totalPageCount' content="0" />
<meta name='dtb:maxPageNumber' content="0" />
</head>
<docTitle>
<text>%s 全部</text>
</docTitle>
<docAuthor>
<text>%s</text>
</docAuthor>
""" % (url, self.author, self.author)
        xml_navMap = u"<navMap>"
        for number, file in enumerate(self.filelist):
            xml_navMap += u"""<navPoint id='file%d' playOrder='%d'>
<navLabel>
<text>%s</text>
</navLabel>
<content src="text/%s" />
</navPoint>
""" % (number + 1, number + 1, file.split('.')[0], str(number + 1) + '.xhtml')
            # Rename each downloaded file to its numbered name so it matches
            # the hrefs written into the manifest and navMap.
            os.rename(self.workdir + '/text/' + file,
                      self.workdir + '/text/' + str(number + 1) + '.xhtml')
        xml_navMap += u"</navMap>"
        xml_navMap += u"</ncx>"
        ncxfile = open(self.workdir + 'toc.ncx', 'w')
        print "[DEBUG] ---- Output Ncx File ----"
        print xml_header
        ncxfile.write(xml_header.encode('utf-8'))
        print xml_navMap
        ncxfile.write(xml_navMap.encode('utf-8'))
        ncxfile.close()
    def make_zip(self):
        epub_zip = zipfile.ZipFile(re.sub(' ', '', self.author) + '.epub', 'w', zipfile.ZIP_DEFLATED)
        # The EPUB container wants the mimetype file first and stored uncompressed.
        epub_zip.write(self.workdir + 'mimetype', 'mimetype', zipfile.ZIP_STORED)
        filelist = os.listdir(self.workdir)
        for file in filelist:
            if file == 'mimetype':
                continue
            epub_zip.write(self.workdir + file, file)
        filelist = os.listdir(self.workdir + 'META-INF')
        for file in filelist:
            epub_zip.write(self.workdir + 'META-INF/' + file, './META-INF/' + file)
        filelist = os.listdir(self.workdir + 'text')
        for file in filelist:
            epub_zip.write(self.workdir + 'text/' + file, './text/' + file)
        epub_zip.close()
def test():
    global author_url
    test_def = Aozora_to_epub(author_url)
    test_def.get_content()
    test_def.make_epub()

if __name__ == '__main__':
    test()
@headbaker
Hi! Is this still working?
