Skip to content

Instantly share code, notes, and snippets.

@uchida
Last active December 24, 2015 09:59
Show Gist options
  • Select an option

  • Save uchida/6780499 to your computer and use it in GitHub Desktop.

Select an option

Save uchida/6780499 to your computer and use it in GitHub Desktop.
青空文庫の「公開中 作家別作品一覧拡充版」から URL を抽出し、テキストファイルを一括取得して、AozoraEpub3 https://github.com/hmdev/AozoraEpub3/ を利用して epub に変換するスクリプト
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# by Akihiro Uchida, CC0 dedicated to the public domain
# see http://creativecommons.org/publicdomain/zero/1.0/
import os.path
import time
import re
import subprocess
import atexit
import cPickle as pickle
from datetime import datetime
from zipfile import ZipFile
from StringIO import StringIO
import requests
import unicodecsv
# Directory that contains this script; the state file and the zip/epub
# output trees are created relative to it.
basedir = os.path.dirname(__file__)

# Pickle file holding the datetime at which the previous run exited.
pkl_fpath = os.path.join(basedir, 'exit.pkl')

# Zipped CSV index of published works (UTF-8 variant, per the file name).
list_all_url = (
    'http://www.aozora.gr.jp/index_pages/'
    'list_person_all_extended_utf8.zip'
)

# strptime format used to parse the CSV's last-modified date column.
datefmt = '%Y-%m-%d'
# Compiled once at import time instead of on every call.  A plain ``u''``
# literal is used (the pattern contains no backslashes, so ``ur''`` bought
# nothing) which keeps the line valid on both Python 2 and Python 3.
_KATAKANA_PREFIX = re.compile(u'[ァ-ヶー]+')


def normalize_name(family, given):
    """Build a display name from a family name and a given name.

    The two parts are normally concatenated directly.  When the
    concatenated name begins with katakana, the non-empty parts are joined
    with a middle dot (``・``) instead.

    :param family: family name (unicode string, may be empty)
    :param given: given name (unicode string, may be empty)
    :returns: the combined name as a unicode string
    """
    name = family + given
    # ``match`` anchors at the start only, so a katakana *prefix* is enough
    # to trigger the middle-dot form.
    if _KATAKANA_PREFIX.match(name):
        # Drop empty parts so a single-part name gets no stray separator.
        name = u'・'.join(part for part in (family, given) if part)
    return name
class AozoraList(object):
    """Iterator over the works listed in the downloaded CSV index.

    Construction loads the previous exit time from ``pkl_fpath`` (if that
    file exists), downloads the zipped CSV at ``list_all_url`` and prepares
    a ``unicodecsv.DictReader`` over it.  Iterating yields one dict per
    usable row with the keys ``author``, ``title``, ``url``, ``modified``
    (a ``datetime``), ``sylab`` and ``exited``.
    """

    def __init__(self):
        # Fall back to the epoch when no previous run has been recorded.
        self.last_exited = datetime.fromtimestamp(0)
        if os.path.exists(pkl_fpath):
            # Text mode is kept on purpose: the state file is written in
            # text mode, and reader and writer have to agree.
            with open(pkl_fpath) as f:
                self.last_exited = pickle.load(f)

        req = requests.get(list_all_url)
        basename = os.path.splitext(os.path.basename(list_all_url))[0]
        # The encoding name must be a string; a bare ``cp932`` identifier
        # would raise NameError for any non-UTF-8 index URL.
        encoding = 'utf-8' if basename.endswith('utf8') else 'cp932'

        # Read the whole CSV member into memory while the archive is open,
        # so the reader never depends on a handle whose ``with`` block has
        # already exited by the time rows are pulled in ``next()``.
        with ZipFile(StringIO(req.content)) as zf:
            csv_bytes = zf.read(basename + '.csv')
        self.csv_reader = unicodecsv.DictReader(StringIO(csv_bytes),
                                                encoding=encoding)

    def __iter__(self):
        return self

    def next(self):
        """Return the next usable row as an item dict.

        Rows with any empty field, or whose text-file URL does not end in
        ``.zip``, are skipped.

        :raises StopIteration: when the CSV is exhausted (propagated from
            the underlying reader).
        """
        while True:
            row = next(self.csv_reader)
            item = {'author': normalize_name(row[u'姓'], row[u'名']),
                    'title': row[u'作品名'],
                    'url': row[u'テキストファイルURL'],
                    'modified': row[u'テキストファイル最終更新日'],
                    # Slice instead of indexing: an empty or missing sort
                    # key yields u'' (and the row is skipped by the check
                    # below) rather than raising before the check runs.
                    'sylab': (row[u'姓読みソート用'] or u'')[:1],
                    'exited': self.last_exited}
            if all(item.values()) and os.path.splitext(item['url'])[-1] == '.zip':
                item['modified'] = datetime.strptime(item['modified'], datefmt)
                return item

    # Python 3 spelling of the iterator protocol; ``next`` stays available.
    __next__ = next
class ZipData(object):
    """Local copy of one work's zip archive, convertible to EPUB.

    Construction downloads ``item['url']`` to
    ``<basedir>/zip/<sylab>/<author>/<title>.zip`` unless a copy already
    exists and the previous run exited after the work was last modified.

    :param item: dict with the keys ``sylab``, ``author``, ``title``,
        ``url``, ``modified`` and ``exited`` (as produced by ``AozoraList``)
    """

    def __init__(self, item):
        # Kept for to_epub(), which previously relied on a module-level
        # variable that happened to be called ``item``.
        self.item = item
        self.fpath = os.path.join(basedir, 'zip', item['sylab'],
                                  item['author'], item['title'] + '.zip')
        if os.path.exists(self.fpath) and item['exited'] > item['modified']:
            return
        r = requests.get(item['url'])
        # Pause after every request, presumably to go easy on the server.
        time.sleep(30)
        if r.status_code != 200:
            return
        parent_dir = os.path.dirname(self.fpath)
        if not os.path.isdir(parent_dir):
            os.makedirs(parent_dir)
        # Binary mode: the payload is a zip archive, and text mode would
        # corrupt it on platforms that translate newlines.
        with open(self.fpath, 'wb') as f:
            f.write(r.content)

    def to_epub(self):
        """Convert the downloaded zip to EPUB by running AozoraEpub3.

        Does nothing when the zip is missing (for example after a failed
        download), or when the EPUB already exists and the previous run
        exited after the work was last modified.

        :raises subprocess.CalledProcessError: if the converter exits with
            a non-zero status.
        """
        item = self.item
        if not os.path.exists(self.fpath):
            # Nothing to convert.
            return
        fpath = os.path.join(basedir, 'epub', item['sylab'],
                             item['author'], item['title'] + '.epub')
        if os.path.exists(fpath) and item['exited'] > item['modified']:
            return
        parent_dir = os.path.dirname(fpath)
        if not os.path.isdir(parent_dir):
            os.makedirs(parent_dir)
        # NOTE(review): the jar and preset paths are relative to the current
        # working directory, not to ``basedir`` -- confirm that is intended.
        subprocess.check_call(['java', '-Dfile.encoding=UTF-8',
                               '-cp', 'AozoraEpub3/AozoraEpub3.jar', 'AozoraEpub3',
                               '-i', 'AozoraEpub3/presets/reader.ini',
                               '-d', parent_dir, '-of', self.fpath])
@atexit.register
def save_exit_date():
    """Record the current local time in ``pkl_fpath`` at interpreter exit.

    Registered with ``atexit``, so it runs once when the process shuts
    down; the pickled ``datetime`` replaces whatever the file held before.
    """
    exit_time = datetime.now()
    with open(pkl_fpath, 'w') as state_file:
        pickle.dump(exit_time, state_file)
if __name__ == '__main__':
    # Fetch every listed work and convert it to EPUB, one at a time.
    # The loop variable is deliberately left as a module-level name called
    # ``item``: code elsewhere in this file may look it up as a global.
    for item in AozoraList():
        ZipData(item).to_epub()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment