Created
July 23, 2011 05:29
-
-
Save esehara/1101065 to your computer and use it in GitHub Desktop.
Aozora Author to Epub
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
import os
import re
import urllib
import urllib2
import zipfile
from xml.sax.saxutils import escape

from BeautifulSoup import BeautifulSoup
"""
Usage:
    Set ``author_url`` below to the URL of an Aozora Bunko author card.
    Running the script then downloads the XHTML file of every work that is
    published for that author and bundles them into a single EPUB.

Note:
    The default working directory is "./temp/".
    If that folder does not exist next to the script, create "./temp/"
    before running it.
"""

# Author card page that is processed when the script is run directly.
author_url = 'http://www.aozora.gr.jp/index_pages/person281.html'
class Aozora_to_epub(object):
    """Drive the whole conversion: scrape an author page, then build the EPUB."""

    def __init__(self, url):
        """Remember the author page *url* and create the two collaborators."""
        self.url = url
        self.epub = Epub_Make()
        self.author = Aozora_author_url()

    def get_content(self):
        """Fetch the author page, prepare the work folder and download works."""
        scraper = self.author
        scraper.get_url(self.url)
        # analyzer() returns the author name, which names the EPUB work folder.
        author_name = scraper.analyzer(self.epub.workspace)
        self.epub.create_init(author_name)
        scraper.get_html()

    def make_epub(self):
        """Write the OPF and NCX metadata files, then zip everything up."""
        book = self.epub
        for write_metadata in (book.make_opf, book.make_ncx):
            write_metadata(self.url)
        book.make_zip()
class Aozora_author_url(object):
    """Read an author page and collect the card URL of every listed work."""

    def __init__(self):
        """Create the URL opener used for fetching the author page."""
        self.opener = urllib2.build_opener()

    def get_url(self, url):
        """Download *url* and keep the parsed page in ``self.mainsoup``."""
        response = self.opener.open(url)
        self.mainsoup = BeautifulSoup(response.read())

    def analyzer(self, workspace):
        """Build ``self.worklist`` from the page and return the author name.

        Every anchor inside the first ``<ol>`` becomes a dict with the keys
        ``'title'`` and ``'url'``.  The text of the first ``<font>`` element is
        stored as ``self.author``, and a downloader that saves below
        ``workspace + self.author`` is created as ``self.works_get``.
        """
        self.worklist = []
        works_listing = self.mainsoup.find('ol')
        self.author = self.mainsoup.find('font').text
        index_root = 'http://www.aozora.gr.jp/index_pages/'
        for anchor in works_listing.findAll('a'):
            title = anchor.text
            print(title)
            entry = dict(title=title, url=index_root + anchor['href'])
            self.worklist.append(entry)
        self.works_get = Aozora_works_get(workspace + self.author)
        return self.author

    def get_html(self):
        """Fetch, analyse and download each work collected by analyzer()."""
        for card_url in (work['url'] for work in self.worklist):
            self.works_get.get_url(card_url)
            self.works_get.analyzer()
            self.works_get.download()
class Aozora_works_get(object):
    """Fetch one work's card page, find its XHTML file and download it.

    Call order per work: get_url() -> analyzer() -> download().
    """

    def __init__(self, dir='./temp/'):
        """Prepare the downloader.

        dir -- folder that contains the ``text`` sub-folder the XHTML files
               are saved into.
        """
        self.opener = urllib2.build_opener()
        self.re_html = re.compile('.*html')
        self.save_dir = dir
        self.encode = System_charset()
        # Defined up front so download() is a harmless no-op when it is
        # called before analyzer().
        self.download_que = False

    def get_url(self, url):
        """Download the card page at *url* and remember its base URL."""
        html = self.opener.open(url).read()
        self.mainsoup = BeautifulSoup(html)
        # Everything before the last '/' is the base for relative links.
        self.base_url = '/'.join(url.split('/')[:-1])

    def analyzer(self):
        """Pick the first link containing "html" from the sixth table.

        On success ``self.download_que`` holds the link's href and
        ``self.author_data`` the text of the page's first ``<font>`` element
        (used by download() as the file name).  When the page does not have
        the expected layout, or no link matches, ``self.download_que`` is
        False.
        """
        self.download_que = False
        try:
            download_table = self.mainsoup.findAll('table')[5]
            anchors = download_table.findAll('a')
            self.author_data = self.mainsoup.find('font').text
        except (AttributeError, IndexError):
            # Page not fetched yet, fewer than six tables, or no <font>.
            print('[DEBUG] Sorry, This File is Problem....')
            return
        for anchor in anchors:
            # .get() keeps an anchor without href from aborting the scan.
            href = anchor.get('href')
            if href is not None and self.re_html.match(href) is not None:
                self.download_que = href
                return

    def download(self):
        """Save the file found by analyzer() and convert it to UTF-8.

        Does nothing when analyzer() found no XHTML link.
        """
        if not self.download_que:
            return
        source_url = self.base_url + '/' + self.download_que
        print(u"[DEBUG]get url => " + source_url)
        # A '/' in the name would be taken as a directory separator.
        file_name = self.author_data.replace(u'/', u'_') + u".xhtml"
        target_path = self.save_dir + u'/text/' + file_name
        urllib.urlretrieve(source_url, target_path)
        self.encode.work(target_path)
class System_charset(object):
    """Detect the encoding of downloaded files and rewrite them as UTF-8.

    See this URL for reference:
    http://www.aozora.gr.jp/index_pages/person281.html
    """

    # Encodings tried, in this order, by guess_charset().
    _CANDIDATES = ('utf-8', 'shift-jis', 'euc-jp', 'iso2022-jp')

    def guess_charset(self, data):
        """Return the name of the first candidate encoding that decodes *data*.

        data -- a byte string.

        Returns one of 'utf-8', 'shift-jis', 'euc-jp', 'iso2022-jp', or None
        when none of them can decode the data.  Empty data is reported as
        'utf-8'.
        """
        candidates = self._CANDIDATES
        if b'\x1b' in data:
            # ISO-2022-JP is 7-bit with ESC sequences, so it always decodes
            # as UTF-8 too; try it first when an ESC byte is present.
            candidates = ('iso2022-jp',) + candidates
        for name in candidates:
            try:
                data.decode(name)
            except UnicodeError:
                continue
            return name
        return None

    def conv(self, data):
        """Return the byte string *data* re-encoded as UTF-8.

        Raises UnicodeError when the encoding cannot be detected.
        """
        charset = self.guess_charset(data)
        if charset is None:
            raise UnicodeError('could not detect the encoding of the data')
        return data.decode(charset).encode('utf-8')

    def work(self, data):
        """Convert the file at path *data* to UTF-8 in place.

        Line endings are normalised to "\\n" and every "Shift_JIS" in the text
        (the charset declarations) is replaced by "UTF-8".  A file whose
        encoding cannot be detected is left untouched.
        """
        print("[DEBUG] Convert data UTF-8 Start :" + data)
        with open(data, 'rb') as source:
            raw = source.read()
        # Same newline normalisation the former 'rU' read mode performed.
        raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
        try:
            converted = self.conv(raw)
        except UnicodeError:
            # Do not rewrite: declaring UTF-8 on unconverted bytes would
            # corrupt the document.
            print("[DEBUG] %s -> skip" % data)
            return
        converted = converted.replace(b'Shift_JIS', b'UTF-8')
        with open(data, 'wb') as target:
            target.write(converted)
        print("[DEBUG] %s -> Convert UTF-8" % data)
class Epub_Make(object):
    """Assemble the XHTML files of one author into an EPUB 2 archive.

    Call order: create_init() -> (XHTML files are placed in <workdir>/text/)
    -> make_opf() -> make_ncx() -> make_zip().
    """

    _CONTAINER_XML = u"""<?xml version='1.0' encoding="UTF-8"?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile full-path="content.opf" media-type="application/oebps-package+xml" />
</rootfiles>
</container>
"""

    # Placeholders: title author, creator, identifier, manifest items, spine.
    _OPF_TEMPLATE = u"""<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>%s 全部</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:language>ja</dc:language>
<dc:publisher>Aozora Bunko to Epub (青空文庫) </dc:publisher>
<dc:identifier id="BookId">%s</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml" />
%s</manifest>
<spine toc="ncx">
%s</spine>
</package>
"""

    # Placeholders: uid, title author, author, navPoint elements.
    _NCX_TEMPLATE = u"""<?xml version="1.0" encoding="UTF-8" ?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name='dtb:uid' content="%s" />
<meta name='dtb:depth' content="1" />
<meta name='dtb:totalPageCount' content="0" />
<meta name='dtb:maxPageNumber' content="0" />
</head>
<docTitle>
<text>%s 全部</text>
</docTitle>
<docAuthor>
<text>%s</text>
</docAuthor>
<navMap>
%s</navMap>
</ncx>
"""

    _NAV_POINT_TEMPLATE = u"""<navPoint id='file%d' playOrder='%d'>
<navLabel>
<text>%s</text>
</navLabel>
<content src="text/%s" />
</navPoint>
"""

    def __init__(self, workspace='./temp/'):
        """workspace -- folder (with trailing '/') that holds the work folders."""
        self.workspace = workspace
        self.author = 'hogehoge'

    def create_init(self, author):
        """Create the work folder for *author* with the fixed EPUB files.

        Creates ``<workspace><author>/`` containing ``text/``, ``META-INF/``,
        ``mimetype`` and ``META-INF/container.xml`` and stores the folder path
        in ``self.workdir``.  Raises OSError when the folder already exists.
        """
        workdir = self.workspace + author + '/'
        self.author = author
        # makedirs also creates a missing workspace folder, but still fails
        # on an existing work folder so stale files are never mixed in.
        os.makedirs(workdir)
        os.mkdir(workdir + 'text')
        os.mkdir(workdir + 'META-INF')
        with open(workdir + 'mimetype', 'wb') as mimetype:
            # The file must hold exactly this string, without a newline.
            mimetype.write(b'application/epub+zip')
        with open(workdir + 'META-INF/container.xml', 'wb') as container:
            container.write(self._CONTAINER_XML.encode('utf-8'))
        self.workdir = workdir

    def make_opf(self, url):
        """Write ``content.opf`` listing every file found in ``text/``.

        url -- used as the book's unique identifier.

        The file names (sorted) are stored in ``self.filelist``.  The manifest
        refers to them as ``text/1.xhtml``, ``text/2.xhtml`` and so on; the
        actual renaming is done by make_ncx().
        """
        filelist = sorted(os.listdir(self.workdir + 'text/'))
        numbers = range(1, len(filelist) + 1)
        manifest = u''.join(
            u"<item id='file%d' href='text/%d.xhtml' "
            u"media-type='application/xhtml+xml' />\n" % (number, number)
            for number in numbers)
        # Every document must be in the spine to be part of the reading order.
        spine = u''.join(
            u'<itemref idref="file%d" />\n' % number for number in numbers)
        author = escape(self.author)
        document = self._OPF_TEMPLATE % (
            author, author, escape(url), manifest, spine)
        print("[DEBUG] ---- Output OPF File ----")
        with open(self.workdir + 'content.opf', 'wb') as opffile:
            opffile.write(document.encode('utf-8'))
        self.filelist = filelist

    def make_ncx(self, url):
        """Write ``toc.ncx`` and rename the text files to ``<n>.xhtml``.

        url -- must be the same identifier that was passed to make_opf().

        Requires ``self.filelist`` as set by make_opf().  Each entry's label
        is the original file name without its extension.
        """
        text_dir = self.workdir + 'text/'
        nav_points = []
        for number, name in enumerate(self.filelist, 1):
            numbered = u'%d.xhtml' % number
            # splitext keeps dots that are part of the title itself.
            title = os.path.splitext(name)[0]
            nav_points.append(self._NAV_POINT_TEMPLATE % (
                number, number, escape(title), numbered))
            os.rename(text_dir + name, text_dir + numbered)
        author = escape(self.author)
        document = self._NCX_TEMPLATE % (
            escape(url, {'"': '&quot;'}), author, author,
            u''.join(nav_points))
        print("[DEBUG] ---- Output Ncx File ----")
        with open(self.workdir + 'toc.ncx', 'wb') as ncxfile:
            ncxfile.write(document.encode('utf-8'))

    def make_zip(self):
        """Pack the work folder into ``<author without spaces>.epub``.

        The archive is written to the current working directory.
        """
        archive_name = self.author.replace(' ', '') + '.epub'
        epub_zip = zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED)
        try:
            # EPUB requires mimetype as the first entry, stored uncompressed.
            epub_zip.write(self.workdir + 'mimetype', 'mimetype',
                           zipfile.ZIP_STORED)
            for name in sorted(os.listdir(self.workdir)):
                path = self.workdir + name
                if name == 'mimetype' or os.path.isdir(path):
                    continue
                epub_zip.write(path, name)
            for folder in ('META-INF', 'text'):
                for name in sorted(os.listdir(self.workdir + folder)):
                    epub_zip.write(self.workdir + folder + '/' + name,
                                   folder + '/' + name)
        finally:
            epub_zip.close()
def test():
    """Run the complete conversion for the module-level ``author_url``."""
    converter = Aozora_to_epub(author_url)
    converter.get_content()
    converter.make_epub()


if __name__ == '__main__':
    test()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi! Is this still working?