Reformat the main text and inline annotations in the Chinese classics (经史子集). The script uses 《庄子集释》 (Zhuangzi Jishi) as its example.
#!/usr/bin/env python
# encoding: utf-8
import re
import os
import json
from bs4 import BeautifulSoup as bs
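
# Chinese numerals in positional order: chinese_number.find(u'三') == 3, so the
# string doubles as a lookup table for converting 【一】-style annotation indexes
# to ASCII digits. annotation_cats maps an annotation's category marker
# (注/疏/釋/校) to the CSS class used when it is rendered.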
chinese_number = u"○一二三四五六七八九"

annotation_cats = {
    u'注': 'zhu',
    u'疏': 'shu',
    u'釋': 'shi',
    u'校': 'jiao'
}
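
# Walk every <p> in the chapter. Paragraphs starting with 【 or ◎ are inline
# annotations belonging to the preceding main-text paragraph; they are collected
# into format_data, keyed by their Chinese-numeral index, and removed from the
# soup. A main-text paragraph is assumed to appear before its annotations.
# The collected structure is then re-rendered by get_text_with_markers()
# as EPUB footnotes.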
def convert_inline_annotations(soup):
    paragraphs = soup.findAll('p')
    format_data = []
    text_index = 0
    annotation_index = 0
    for p in paragraphs:
        if p.text.startswith((u'【', u'◎')):
            annotations = format_data[text_index]['annotations']
            annotation = p.text
            match = re.match(ur'^【([○一二三四五六七八九]+)】.*', annotation)
            if match:
                # print "=== ANNOTATION WITH INDEX FOUND ==="
                # print annotation
                annotation_index = match.group(1)
                #annotation_index = int(''.join(map(lambda x: str(chinese_number.find(x)), annotation_index)))-1
                #cat = get_annotation_cat(annotation)
                annotations[annotation_index] = remove_index(annotation)
                #annotations[annotation_index][cat] = annotation
                p.extract()
            else:
                # print "=== ANNOTATION WITH MARKER FOUND ==="
                # print annotation
                if annotations:
                    annotations[annotation_index] += remove_index(annotation)
                    p.extract()
                else:
                    print u"This is not an inline annotation. =>{}<=".format(p.text)
        else:
            # print "=== MAIN TEXT FOUND: ==="
            # print p.text
            # skip increasing the index for the first main-text paragraph.
            if format_data:
                text_index += 1
            format_data.append({'text': '', 'annotations': {}})
            format_data[text_index]['text'] = p.text
            p.extract()
    print json.dumps(format_data, indent=2, ensure_ascii=False)
    soup = get_text_with_markers(format_data, soup)
    return soup
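
# Alternative renderer (not called by convert_inline_annotations above): instead
# of footnotes, each annotation is substituted back into the main text as a
# <span class='annotation'> element at the position of its 【index】 marker.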
def get_text_with_annotations(format_data, soup):
    for text_index, data in enumerate(format_data):
        annotations = data['annotations']
        main_text = data['text']
        # print main_text
        if annotations:
            for index, items in annotations.iteritems():
                #concatenated_annotation = ' '.join(items.values())
                annotation = add_span_for_marker(items)
                annotations_span = u"<span class='annotation'>{}</span>".format(annotation)
                main_text = main_text.replace(u'【{}】'.format(index), annotations_span)
        main_text_p = bs(u"<p>{}</p>".format(main_text), 'html.parser').p
        soup.div.append(main_text_p)
    return soup
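
# Footnote renderer used by convert_inline_annotations(): each 【index】 marker in
# the main text is replaced with a noteref link, and the annotation bodies are
# appended after the paragraph as <aside> footnotes.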
def get_text_with_markers(format_data, soup):
    for text_index, data in enumerate(format_data):
        annotations = data['annotations']
        main_text = data['text']
        footnotes = []
        if annotations:
            #print json.dumps(annotations, indent=2, ensure_ascii=False)
            for index, items in sorted(annotations.iteritems()):
                main_text = main_text.replace(
                    u'【{}】'.format(index), add_footnote_link(text_index, index)
                )
                footnotes.append(add_footnote_id(items, text_index, index))
        main_text_p = bs(u"<p>{}</p>".format(main_text), 'html.parser').p
        soup.div.append(main_text_p)
        for footnote in footnotes:
            footnote_p = bs(footnote, 'html.parser').aside
            soup.div.append(footnote_p)
    return soup
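
# EPUB 3 footnote markup: the in-text marker becomes an <a epub:type="noteref">
# superscript linking to an <aside epub:type="footnote"> with a matching id of
# the form p{paragraph_index}a{annotation_index}.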
def add_footnote_link(p_index, a_index):
    # a_index is a Chinese-numeral string such as u'三' or u'一○'; convert each
    # character through its position in chinese_number to get the ASCII digits.
    a_index_ascii = ''.join(str(chinese_number.find(c)) for c in a_index)
    footnote_link = u'<a epub:type="noteref" href="#p{0}a{1}" id="p{0}a{1}ref"><sup>{1}</sup></a>'.format(p_index, a_index_ascii)
    return footnote_link

def add_footnote_id(content, p_index, a_index):
    a_index_ascii = ''.join(str(chinese_number.find(c)) for c in a_index)
    footnote = u'<aside epub:type="footnote" id="p{1}a{2}">{0}</aside>'.format(content, p_index, a_index_ascii)
    return footnote
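
# Helpers for normalising the bracketed markers: make_singular_character reduces
# a marker such as 【釋文】 to 【釋】, add_span_for_marker wraps category markers
# in styled <span>s, and remove_index strips the leading 【index】 from an
# annotation body.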
def get_annotation_cat(text):
    match = re.search(ur'【[^○一二三四五六七八九】]+】', text)
    cat = make_singular_character(match.group(0)).replace(u'【', '').replace(u'】', '')
    #print cat
    return cat

def add_span_for_marker(text):
    text = make_singular_character(text)
    for i, v in annotation_cats.iteritems():
        text = text.replace(
            u'【{}】'.format(i),
            u'<span class="common {0}">{1}</span>'.format(v, i)
        )
    return text

def remove_multiple_newlines(text):
    # Collapse runs of blank lines (left behind where <p> tags were extracted).
    return re.sub(r'\n{2,}', '', text)

def remove_bracket(text):
    return text.replace(u'【', '').replace(u'】', '')

def remove_index(text):
    return re.sub(ur'^【[○一二三四五六七八九]+】', '', make_singular_character(text))

def change_shiwen_to_shi(text):
    return text.replace(u'【釋文】', u'【釋】')

def make_singular_character(text):
    return re.sub(ur'【.*([注疏釋校]).*】', ur'【\1】', text, flags=re.UNICODE)
    #return re.sub(ur'【([^】])[^】]+】', ur'【\1】', text, flags=re.UNICODE)
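
# Driver: read every *.html chapter under ~/Downloads/zhuangzi/OPS (the OPS
# directory of the unpacked EPUB), rewrite its inline annotations as footnotes,
# make sure the epub namespace is declared on <html>, and save the file in place.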
def main():
    ops_path = os.path.join(
        os.path.expanduser('~'), 'Downloads', 'zhuangzi', 'OPS')
    chapters = sorted(map(lambda x: os.path.join(ops_path, x),
                          filter(
                              lambda x: x.endswith('html'), os.listdir(ops_path))
                          ))
    #print chapters
    for c in chapters:
        print "*" * 100
        print c
        with open(c) as f:
            content = f.read()
        html = bs(content, 'html.parser')
        html = convert_inline_annotations(html)
        if 'xmlns:epub' not in html.html.attrs:
            html.html['xmlns:epub'] = "http://www.idpf.org/2007/ops"
        #print html.prettify()
        with open(c, 'wb') as f:
            f.write(remove_multiple_newlines(str(html)))

if __name__ == '__main__':
    main()