@komeda-shinji
Last active September 10, 2018 01:38
Convert the JAWS-UG CLI 専門支部 (CLI specialty branch) hands-on texts so they can be run in Jupyter Notebook
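The script below fetches each hands-on page, converts prose into Markdown cells and <pre> blocks into bash code cells, and follows links to other .html pages, writing one .ipynb file per page. A rough usage sketch (the script name to_ipynb.py and the example URL are placeholders, not part of the gist; lxml and html2text must be installed, and the generated notebooks expect a bash kernel):

    pip install lxml html2text
    python3 to_ipynb.py https://example.com/handson/prepare/index.html
    python3 to_ipynb.py /path/to/local/handson/index.html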
#!/usr/bin/env python
import json
import urllib.request
import lxml.html
import html2text
from urllib.parse import urlparse, urljoin
import sys
crawl_url = {}
visit = {}
ipynb = {
    "cells": [
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Bash",
            "language": "bash",
            "name": "bash"
        },
        "language_info": {
            "codemirror_mode": "shell",
            "file_extension": ".sh",
            "mimetype": "text/x-sh",
            "name": "bash"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2
}

def markdown(source):
    # Build a markdown cell, trimming leading/trailing blank lines.
    if source[0] == '\n': del source[0]
    if source[-1] == '\n': source.pop()
    if len(source) == 0: return None
    if source[-1][-1] == '\n': source[-1] = source[-1][:-1]
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": source
    }

def code(source):
    # Build a bash code cell, trimming blank lines.
    if source[0] == '\n': del source[0]
    while source[-1] == '\n': source.pop()
    if len(source) == 0: return None
    if source[-1][-1] == '\n': source[-1] = source[-1][:-1]
    # dirty rewrite: drop the trailing backslash from lines ending in "$( \"
    # and from the line preceding a lone ") \", and fold a leading "  && "
    # onto the previous line so the continuations survive html2text
    for i in range(0, len(source)):
        if source[i][-5:] == '$( \\\n':
            source[i] = source[i][0:-3] + '\n'
        if source[i] == ') \\\n':
            if i > 0 and source[i - 1][-3:] == ' \\\n':
                source[i - 1] = source[i - 1][0:-3] + '\n'
        if source[i][0:5] == '  && ':
            if i > 0 and source[i - 1][-3:] == ' \\\n':
                source[i - 1] = source[i - 1][0:-3] + ' && \n'
            source[i] = source[i][5:]
    return {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": source
    }

def convert(url, cells, element):
    # Walk the page body and turn each element into a notebook cell.
    for child in list(element):
        if child.tag == 'div':
            convert(url, cells, child)
        elif child.tag == 'ul':
            # Link lists: rewrite .html links to .ipynb and remember the
            # original URL so the linked page gets crawled as well.
            for item in list(child):
                href = None
                for e in list(item):
                    if e.tag == 'a':
                        href = e.get('href')
                        u = urlparse(href)
                        if u.scheme and u.netloc:
                            href = '{}://{}/{}'.format(u.scheme, u.netloc, u.path)
                        else:
                            href = u.path
                        if href[-5:] == '.html':
                            file = href[:-5] + '.ipynb'
                            crawl_url[file] = urljoin(url, href)
                            href = file
                        e.set('href', href)
                content = lxml.html.tostring(item).decode().rstrip()
                if href:
                    #print(" ==> ", crawl_url)
                    #crawl(urlparse.urljoin(url, href))
                    if content[-5:] == '</li>':
                        content = content[:-5] + '<a href="{}">*</a>'.format(crawl_url[href]) + '</li>'
                    else:
                        content += '<a href="{}">*</a>'.format(crawl_url[href])
                #print(content)
                source = html2text.html2text(content).splitlines(keepends=True)
                cell = markdown(source)
                if cell:
                    cells.append(cell)
        elif child.tag == 'pre':
            # <pre> blocks become code cells; drop the 4-space indent
            # that html2text adds to code blocks.
            content = lxml.html.tostring(child)
            source = []
            for item in html2text.html2text(content.decode()).splitlines(keepends=True):
                if item[0:4] == '    ': item = item[4:]
                source.append(item)
            #print(''.join(source))
            cell = code(source)
            if cell:
                cells.append(cell)
        elif child.tag == 'blockquote':
            for qc in list(child):
                convert(url, cells, qc)
        else:
            # Everything else becomes a markdown cell; strip the trailing
            # heading-permalink pilcrow if present.
            content = lxml.html.tostring(child)
            source = html2text.html2text(content.decode()).splitlines(keepends=True)
            #print(html2text.html2text(content.decode()))
            if source[0][-2:] == '¶\n':
                source[0] = source[0][:-2] + '\n'
            cell = markdown(source)
            if cell:
                cells.append(cell)

def crawl(url, file):
    # Fetch one page, convert it to a notebook, then follow collected links.
    if visit.get(url): return
    print('crawl({}, {})'.format(url, file))
    visit[url] = True
    page = urllib.request.urlopen(url).read()
    tree = lxml.html.fromstring(page)
    body = tree.xpath("//div[@id='body']")
    cells = []
    convert(url, cells, body)
    for i, cell in enumerate(cells):
        # A code block right after a "結果(例):" ("result (example)") paragraph
        # is sample output, not a command, so demote it to a markdown cell.
        if cell["cell_type"] == "markdown" and \
           cell["source"][0] == "結果(例):" and \
           cells[i + 1]["cell_type"] == "code":
            cells[i + 1] = markdown(cells[i + 1]["source"])
    ipynb['cells'] = cells
    with open(file, "w") as fh:
        fh.write(json.dumps(ipynb, sort_keys=True, ensure_ascii=False, indent=1))
    # Iterate over a copy: recursive crawls add new entries to crawl_url,
    # and mutating a dict while iterating it raises a RuntimeError.
    for file, url in list(crawl_url.items()):
        crawl(url, file)

if __name__ == '__main__':
    for url in sys.argv[1:]:
        print('get {}'.format(url))
        if url[0] == '/':
            # Accept local file paths as well as http(s) URLs.
            url = 'file://' + url
        file = urlparse(url).path.rpartition('/')[-1].rpartition('.')[0] + '.ipynb'
        crawl(url, file)