#!/usr/bin/env python3
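"""Build a Japanese Python 3 docset for Dash.

Unpacks the English "Python 3.docset" shipped with Dash, replaces every HTML
page with its translation downloaded from https://docs.python.jp/3/, copies
Dash's dashAnchor table-of-contents markers into the translated pages, and
strips scripts that serve no purpose inside Dash.
"""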

import copy
import os
import urllib.error
import urllib.parse
import urllib.request
from shlex import quote

from bs4 import BeautifulSoup

# Name of the generated docset; it is created in the current directory.
new_docset_name = 'Python 3-ja.docset'
# URL of the Japanese documentation.
base_url = 'https://docs.python.jp/3/'
# Path to the compressed tarix archive inside the English docset.
original_docset_path = '~/Library/Application Support/Dash/DocSets/Python_3/Python 3.docset/Contents/Resources/tarix.tgz'
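
# Requirements (inferred from the imports and shell commands below): Python 3
# with beautifulsoup4 installed, the English "Python 3" docset already
# downloaded in Dash, and macOS for /usr/libexec/PlistBuddy.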


# Download the Japanese version of an HTML file; returns None on failure.
def get_jp_resource(doc_path):
    # The symbols index has a Japanese file name on the server.
    if doc_path == 'genindex-Symbols.html':
        doc_path = urllib.parse.quote('genindex-記号.html')
    url = urllib.parse.urljoin(base_url, doc_path)
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.URLError:
        return None
    return response.read().decode('utf-8')
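
# For example, get_jp_resource('library/os.html') fetches
# https://docs.python.jp/3/library/os.html and returns its markup as a str
# (assuming the docset uses the same relative paths as the online docs).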


# One Scraper instance is created per HTML file.
class Scraper:
    def __init__(self, docroot_path, doc_path):
        print(doc_path)
        self.doc_path = doc_path
        self.full_path = os.path.join(docroot_path, doc_path)
        with open(self.full_path, encoding='utf-8') as fp:
            self.en_soup = BeautifulSoup(fp, "html.parser")
        jp_resource = get_jp_resource(doc_path)
        if jp_resource:
            self.ja_soup = BeautifulSoup(jp_resource, "html.parser")
        else:
            raise RuntimeError("download failed: " + doc_path)
        if self.doc_path == 'genindex.html':
            self.workaround_genindex_symbols()

    # Saving overwrites the original file in place.
    def save(self):
        # output = self.ja_soup.prettify(formatter=None)
        output = str(self.ja_soup)
        with open(self.full_path, 'w', encoding='utf-8') as f:
            f.write(output)

    # Remove scripts that serve no purpose inside Dash.
    def strip_unnecessary_elements(self):
        for script in self.ja_soup.find_all("script"):
            script.clear()
            src = script.attrs.get('src') if script.attrs else None
            if src and src.endswith(('_static/version_switch.js',
                                     '_static/translations.js',
                                     '_static/sidebar.js')):
                script.decompose()

    # Find the dashAnchor elements in the English page and insert them at the
    # corresponding positions in the Japanese page.
    def copy_dash_anchors(self):
        dash_anchors = self.en_soup.find_all(attrs={"class": "dashAnchor"})
        for dash_anchor in dash_anchors:
            ja_anchor = self.anchor_in_ja(dash_anchor['name'])
            if ja_anchor is None:
                print("ERROR: anchor not found: " + dash_anchor['name'])
            else:
                ja_anchor.insert_before(copy.copy(dash_anchor))
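
    # Dash anchor names look like '//apple_ref/Function/tabnanny.tokeneater';
    # os.path.basename() of such a name yields 'tabnanny.tokeneater', which
    # anchor_in_ja() below tries first as the element id.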

    # Find the Japanese element corresponding to a dashAnchor.
    def anchor_in_ja(self, name):
        # The target id normally matches the last path component of the
        # dashAnchor name.
        label = os.path.basename(name)
        ja_anchor = self.ja_soup.find(attrs={"id": label})
        # Some ids do not match directly; try known variations.
        if ja_anchor is None:
            label2 = (os.path.basename(os.path.dirname(name)) + '-' + label).lower()
            if name == '//apple_ref/Function/tabnanny.tokeneater':
                label2 = 'tabnanny.process_tokens'
            elif name == '//apple_ref/Section/async%20def':
                label2 = 'async-def'
            elif name == '//apple_ref/Section/async%20for':
                label2 = 'async-for'
            elif name == '//apple_ref/Section/async%20with':
                label2 = 'async-with'
            elif name == '//apple_ref/Module/xml.etree.ElementTree':
                label2 = 'module-xml.etree.ElementTree'
            elif name == '//apple_ref/Module/cProfile':
                label2 = 'module-cProfile'
            ja_anchor = self.ja_soup.find(attrs={"id": label2})
        # Finally, try common id prefixes used by the generated pages.
        for prefix in ('c.', 'tut-', 'opcode-', 'envvar-'):
            if ja_anchor is None:
                ja_anchor = self.ja_soup.find(attrs={"id": prefix + label})
        return ja_anchor

    # Work around file names that differ between the English and Japanese
    # versions.
    def workaround_genindex_symbols(self):
        bad_links = self.ja_soup.find_all(href="genindex-記号.html")
        for bad_link in bad_links:
            bad_link.attrs['href'] = "genindex-Symbols.html"


class DocsetJP:
    def start(self):
        if os.path.exists(new_docset_name):
            print('ERROR: "' + new_docset_name + '" already exists in current directory')
            return False
        if self.copy_from_original():
            self.do_main()
            print('All done')
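
    # tarix.tgz is the compressed archive Dash stores inside the docset; the
    # code below relies on it unpacking into a complete "Python 3.docset" tree.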

    # Extract tarix.tgz from the compressed docset, then rename the result and
    # fix up Info.plist.
    def copy_from_original(self):
        archive_path = os.path.expanduser(original_docset_path)
        if not os.path.exists(archive_path):
            print('ERROR: Python 3.docset (and/or tarix.tgz) not found:', archive_path)
            return False
        os.system('tar zxf {}'.format(quote(archive_path)))
        if not os.path.exists('Python 3.docset'):
            print('ERROR: could not extract tarix.tgz')
            return False
        os.rename('Python 3.docset', new_docset_name)
        real_path = os.path.realpath(new_docset_name)
        self.docroot_path = os.path.join(real_path, "Contents/Resources/Documents/doc/")
        plist_path = os.path.join(real_path, "Contents/Info.plist")
        os.system('/usr/libexec/PlistBuddy -c "set :CFBundleIdentifier python_ja" ' + quote(plist_path))
        os.system('/usr/libexec/PlistBuddy -c "set :CFBundleName Python 3-ja" ' + quote(plist_path))
        return True

    # Find and process every HTML file inside the docset.
    def do_main(self):
        if not os.path.exists(self.docroot_path):
            print('ERROR: working Python 3.docset not found:', self.docroot_path)
            return
        for root, dirs, files in os.walk(self.docroot_path):
            for file in files:
                _, ext = os.path.splitext(file)
                if ext == ".html":
                    doc_path = os.path.relpath(os.path.join(root, file), self.docroot_path)
                    scraper = Scraper(self.docroot_path, doc_path)
                    scraper.copy_dash_anchors()
                    scraper.strip_unnecessary_elements()
                    scraper.save()
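
# Typical invocation (the script name is arbitrary; use whatever this file is
# saved as):
#   $ python3 make_python3_ja_docset.py
# The resulting "Python 3-ja.docset" can then be added to Dash manually.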


if __name__ == '__main__':
    d = DocsetJP()
    d.start()