Last active
December 24, 2015 09:59
-
-
Save uchida/6780499 to your computer and use it in GitHub Desktop.
青空文庫の「公開中 作家別作品一覧拡充版」から URL を抽出し、テキストファイルを一括取得して、AozoraEpub3 https://github.com/hmdev/AozoraEpub3/ を利用して epub に変換するスクリプト
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| # by Akihiro Uchida, CC0 dedicated to the public domain | |
| # see http://creativecommons.org/publicdomain/zero/1.0/ | |
| import os.path | |
| import time | |
| import re | |
| import subprocess | |
| import atexit | |
| import cPickle as pickle | |
| from datetime import datetime | |
| from zipfile import ZipFile | |
| from StringIO import StringIO | |
| import requests | |
| import unicodecsv | |
# Directory containing this script; the 'zip' and 'epub' output trees and the
# state file below are all created relative to it.
basedir = os.path.dirname(__file__)
# Pickle file holding the datetime at which the script last exited.
pkl_fpath = os.path.join(basedir, 'exit.pkl')
# Zip archive containing the Aozora Bunko catalogue CSV. The CSV inside is
# looked up under the same base name, and a base name ending in 'utf8'
# selects UTF-8 decoding.
list_all_url = 'http://www.aozora.gr.jp/index_pages/list_person_all_extended_utf8.zip'
# strptime format used to parse the catalogue's last-modified date column.
datefmt = '%Y-%m-%d'
def normalize_name(family, given):
    """Build a display name from a family name and a given name.

    The two parts are normally concatenated directly. When the concatenated
    name begins with katakana, the non-empty parts are joined with a middle
    dot (u'・') instead.

    :param family: family name (unicode string, may be empty)
    :param given: given name (unicode string, may be empty)
    :return: the combined name as a unicode string
    """
    name = family + given
    # The pattern contains no backslashes, so a plain unicode literal is
    # equivalent to the Python-2-only ``ur''`` form and also parses on
    # Python 3.
    katakana_pattern = re.compile(u'[ァ-ヶー]+')
    # ``match`` anchors at the start only: a leading katakana run is enough.
    if katakana_pattern.match(name):
        # Skip empty parts so a missing given name leaves no stray dot.
        name = u'・'.join([s for s in [family, given] if s])
    return name
class AozoraList(object):
    """Iterator over downloadable works in the Aozora Bunko catalogue.

    Construction downloads the catalogue zip from ``list_all_url`` and loads
    the CSV it contains. Iterating yields one dict per usable row with the
    keys ``author``, ``title``, ``url``, ``modified`` (a ``datetime``),
    ``sylab`` (first character of the family-name sort key) and ``exited``
    (the datetime stored in ``pkl_fpath`` by a previous run, or the epoch if
    that file does not exist).
    """

    def __init__(self):
        # Default to the epoch so that, without a state file, every work
        # counts as modified since the last exit.
        self.last_exited = datetime.fromtimestamp(0)
        if os.path.exists(pkl_fpath):
            # Pickle data is binary, so open the file in binary mode.
            # NOTE: unpickling is only safe because this file is local state
            # written by this script's own exit handler.
            with open(pkl_fpath, 'rb') as f:
                self.last_exited = pickle.load(f)
        req = requests.get(list_all_url)
        basename = os.path.splitext(os.path.basename(list_all_url))[0]
        encoding = 'utf-8' if basename.endswith('utf8') else 'cp932'
        # Read the whole CSV into memory while the archive is open; the
        # reader is consumed lazily, long after the zip handles are closed.
        with ZipFile(StringIO(req.content)) as zf:
            csv_data = zf.read(basename + '.csv')
        self.csv_reader = unicodecsv.DictReader(StringIO(csv_data),
                                                encoding=encoding)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next row that has every field set and a '.zip' URL.

        Rows with any empty field, or whose text-file URL does not end in
        '.zip', are skipped. ``StopIteration`` from the underlying CSV reader
        propagates to end the iteration.
        """
        while True:
            row = next(self.csv_reader)
            item = {'author': normalize_name(row[u'姓'], row[u'名']),
                    'title': row[u'作品名'],
                    'url': row[u'テキストファイルURL'],
                    'modified': row[u'テキストファイル最終更新日'],
                    # Slice rather than index: an empty sort key becomes ''
                    # and the row is filtered out below instead of raising
                    # IndexError.
                    'sylab': row[u'姓読みソート用'][:1],
                    'exited': self.last_exited}
            if all(item.values()) and os.path.splitext(item['url'])[-1] == '.zip':
                item['modified'] = datetime.strptime(item['modified'], datefmt)
                return item

    # Python 2 iterator protocol name.
    next = __next__
class ZipData(object):
    """Local copy of one work's zip archive, convertible to EPUB.

    :param item: dict as produced by ``AozoraList`` with the keys ``sylab``,
        ``author``, ``title``, ``url``, ``modified`` and ``exited``.

    Construction downloads the archive to
    ``<basedir>/zip/<sylab>/<author>/<title>.zip`` unless a copy already
    exists and the previous run exited after the work was last modified.
    A response with a status other than 200 is ignored and leaves no file.
    """

    def __init__(self, item):
        # Keep the item so to_epub() does not depend on a global named
        # ``item`` existing in the calling script.
        self.item = item
        self.fpath = os.path.join(basedir, 'zip', item['sylab'],
                                  item['author'], item['title'] + '.zip')
        if os.path.exists(self.fpath) and item['exited'] > item['modified']:
            return
        r = requests.get(item['url'])
        # Pause after every request; presumably this throttles load on the
        # server.
        time.sleep(30)
        if r.status_code != 200:
            return
        parent_dir = os.path.dirname(self.fpath)
        if not os.path.isdir(parent_dir):
            os.makedirs(parent_dir)
        # The payload is a zip archive: write it in binary mode.
        with open(self.fpath, 'wb') as f:
            f.write(r.content)

    def to_epub(self):
        """Convert the downloaded zip to EPUB by running AozoraEpub3 via java.

        The target is ``<basedir>/epub/<sylab>/<author>/<title>.epub``.
        Nothing is done when that file exists and the previous run exited
        after the work was last modified, or when the zip was never
        downloaded.

        :raises subprocess.CalledProcessError: if java exits non-zero.
        """
        item = self.item
        fpath = os.path.join(basedir, 'epub', item['sylab'],
                             item['author'], item['title'] + '.epub')
        if os.path.exists(fpath) and item['exited'] > item['modified']:
            return
        if not os.path.exists(self.fpath):
            # The download was skipped or failed; there is nothing to convert.
            return
        parent_dir = os.path.dirname(fpath)
        if not os.path.isdir(parent_dir):
            os.makedirs(parent_dir)
        # NOTE(review): the jar and preset paths are relative to the current
        # working directory, not to basedir. '-d' looks like the output
        # directory and '-of' like a flag naming the output after the input
        # file -- confirm against the AozoraEpub3 command-line documentation.
        subprocess.check_call(['java', '-Dfile.encoding=UTF-8',
                               '-cp', 'AozoraEpub3/AozoraEpub3.jar', 'AozoraEpub3',
                               '-i', 'AozoraEpub3/presets/reader.ini',
                               '-d', parent_dir, '-of', self.fpath])
@atexit.register
def save_exit_date():
    """Store the current local time in ``pkl_fpath`` as a pickled datetime.

    Registered with ``atexit``, so it runs when the interpreter exits. The
    stored value is what ``AozoraList`` loads as the last-exit time on the
    next run.
    """
    # Pickle data is binary, so open the file in binary mode.
    with open(pkl_fpath, 'wb') as f:
        pickle.dump(datetime.now(), f)
if __name__ == '__main__':
    # Walk the catalogue: fetch each work's zip if needed, then convert it
    # to EPUB.
    for item in AozoraList():
        ZipData(item).to_epub()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment