Make the JAWS-UG CLI 専門支部 (specialist branch) hands-on texts executable in Jupyter Notebook.
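Usage sketch (the script filename and URL below are placeholders, not part of the gist): pass one or more hands-on page URLs on the command line; an absolute local path is also accepted and turned into a file:// URL. One .ipynb, named after the page, is written per crawled page, and pages linked from list items are crawled recursively.

    $ python3 handson2ipynb.py https://example.com/handson/prepare.html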
#!/usr/bin/env python
import json
import sys
import urllib.request
from urllib.parse import urlparse, urljoin

import lxml.html
import html2text

# Pages discovered during conversion: {output filename: source URL}.
crawl_url = {}
# URLs already fetched, so the same page is never crawled twice.
visit = {}

# Skeleton of an nbformat 4 notebook set up for the bash kernel;
# its "cells" list is replaced for every page that is converted.
ipynb = {
    "cells": [
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Bash",
            "language": "bash",
            "name": "bash"
        },
        "language_info": {
            "codemirror_mode": "shell",
            "file_extension": ".sh",
            "mimetype": "text/x-sh",
            "name": "bash"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2
}
def markdown(source):
    """Build a markdown cell from a list of source lines, or None if empty."""
    if source and source[0] == '\n': del source[0]
    if source and source[-1] == '\n': source.pop()
    if len(source) == 0: return None
    if source[-1][-1] == '\n': source[-1] = source[-1][:-1]
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": source
    }
def code(source):
    """Build a code cell from a list of source lines, or None if empty."""
    if source and source[0] == '\n': del source[0]
    while source and source[-1] == '\n': source.pop()
    if len(source) == 0: return None
    if source[-1][-1] == '\n': source[-1] = source[-1][:-1]
    # Dirty rewrite: drop the redundant backslash continuations html2text
    # leaves around command substitutions, and move a leading '&& ' onto
    # the previous line, so the commands run cleanly in a bash cell.
    for i in range(0, len(source)):
        if source[i][-5:] == '$( \\\n':
            source[i] = source[i][0:-3] + '\n'
        if source[i] == ') \\\n':
            if i > 0 and source[i - 1][-3:] == ' \\\n':
                source[i - 1] = source[i - 1][0:-3] + '\n'
        if source[i][0:5] == '  && ':
            if i > 0 and source[i - 1][-3:] == ' \\\n':
                source[i - 1] = source[i - 1][0:-3] + ' && \n'
            source[i] = source[i][5:]
    return {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": source
    }
def convert(url, cells, element):
    """Walk the HTML tree and append a notebook cell per block element."""
    for child in list(element):
        if child.tag == 'div':
            convert(url, cells, child)
        elif child.tag == 'ul':
            # Link lists: rewrite .html hrefs to .ipynb and queue the
            # original URLs for crawling.
            for item in list(child):
                href = None
                for e in list(item):
                    if e.tag == 'a':
                        href = e.get('href')
                        u = urlparse(href)
                        if u.scheme and u.netloc:
                            # Strip query and fragment (u.path keeps its leading '/').
                            href = '{}://{}{}'.format(u.scheme, u.netloc, u.path)
                        else:
                            href = u.path
                        if href[-5:] == '.html':
                            file = href[:-5] + '.ipynb'
                            crawl_url[file] = urljoin(url, href)
                            href = file
                        e.set('href', href)
                content = lxml.html.tostring(item).decode().rstrip()
                if href in crawl_url:
                    # Append a '*' link back to the original page.
                    if content[-5:] == '</li>':
                        content = content[:-5] + '<a href="{}">*</a>'.format(crawl_url[href]) + '</li>'
                    else:
                        content += '<a href="{}">*</a>'.format(crawl_url[href])
                source = html2text.html2text(content).splitlines(keepends=True)
                cell = markdown(source)
                if cell:
                    cells.append(cell)
        elif child.tag == 'pre':
            # <pre> blocks become code cells; drop html2text's 4-space indent.
            content = lxml.html.tostring(child)
            source = []
            for item in html2text.html2text(content.decode()).splitlines(keepends=True):
                if item[0:4] == '    ': item = item[4:]
                source.append(item)
            cell = code(source)
            if cell:
                cells.append(cell)
        elif child.tag == 'blockquote':
            for qc in list(child):
                convert(url, cells, qc)
        else:
            # Headings, paragraphs, etc. become markdown cells.
            content = lxml.html.tostring(child)
            source = html2text.html2text(content.decode()).splitlines(keepends=True)
            # Strip the pilcrow that Sphinx appends to heading permalinks.
            if source[0][-2:] == '¶\n':
                source[0] = source[0][:-2] + '\n'
            cell = markdown(source)
            if cell:
                cells.append(cell)
def crawl(url, file):
    """Fetch one page, write it out as a notebook, then crawl linked pages."""
    if visit.get(url): return
    print('crawl({}, {})'.format(url, file))
    visit[url] = True
    page = urllib.request.urlopen(url).read()
    tree = lxml.html.fromstring(page)
    # xpath() returns a list; convert() iterates it like an element's children.
    body = tree.xpath("//div[@id='body']")
    cells = []
    convert(url, cells, body)
    # Demote the code cell that follows a "結果(例):" ("result (example):")
    # label to markdown, so expected output is not executed as a command.
    for i, cell in enumerate(cells):
        if cell["cell_type"] == "markdown" and \
           cell["source"][0] == "結果(例):" and \
           i + 1 < len(cells) and \
           cells[i + 1]["cell_type"] == "code":
            cells[i + 1] = markdown(cells[i + 1]["source"])
    ipynb['cells'] = cells
    with open(file, "w") as fh:
        fh.write(json.dumps(ipynb, sort_keys=True, ensure_ascii=False, indent=1))
    # Iterate over a snapshot: convert() grows crawl_url while we recurse.
    for file, url in list(crawl_url.items()):
        crawl(url, file)
if __name__ == '__main__':
    for url in sys.argv[1:]:
        print('get {}'.format(url))
        # An absolute local path is accepted as a file:// URL.
        if url[0] == '/':
            url = 'file://' + url
        # Name the notebook after the page: .../foo.html -> foo.ipynb
        file = urlparse(url).path.rpartition('/')[-1].rpartition('.')[0] + '.ipynb'
        crawl(url, file)
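A quick sanity check, as a sketch (the filename prepare.ipynb is a hypothetical output name): the generated file should parse as plain nbformat 4 JSON carrying the bash kernelspec from the skeleton above.

    import json

    with open('prepare.ipynb') as fh:
        nb = json.load(fh)
    assert nb['nbformat'] == 4
    assert nb['metadata']['kernelspec']['name'] == 'bash'
    print('{} cells'.format(len(nb['cells'])))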