Make the JAWS-UG CLI 専門支部 (specialist branch) hands-on texts executable in Jupyter Notebook.
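Usage sketch (the script filename and URL below are placeholders, not part of the gist): pass one or more hands-on page URLs on the command line; an absolute local path is also accepted and turned into a file:// URL. One .ipynb, named after the page, is written per crawled page, and pages linked from list items are crawled recursively.

    $ python3 handson2ipynb.py https://example.com/handson/prepare.html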
#!/usr/bin/env python
import json
import sys
import urllib.request
from urllib.parse import urlparse, urljoin

import lxml.html
import html2text

# Pages discovered during conversion: {output filename: source URL}.
crawl_url = {}
# URLs already fetched, so the same page is never crawled twice.
visit = {}

# Skeleton of an nbformat 4 notebook set up for the bash kernel;
# its "cells" list is replaced for every page that is converted.
ipynb = {
    "cells": [
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Bash",
            "language": "bash",
            "name": "bash"
        },
        "language_info": {
            "codemirror_mode": "shell",
            "file_extension": ".sh",
            "mimetype": "text/x-sh",
            "name": "bash"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2
}
def markdown(source):
    """Build a markdown cell from a list of source lines, or None if empty."""
    if source and source[0] == '\n': del source[0]
    if source and source[-1] == '\n': source.pop()
    if len(source) == 0: return None
    if source[-1][-1] == '\n': source[-1] = source[-1][:-1]
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": source
    }
def code(source):
    """Build a code cell from a list of source lines, or None if empty."""
    if source and source[0] == '\n': del source[0]
    while source and source[-1] == '\n': source.pop()
    if len(source) == 0: return None
    if source[-1][-1] == '\n': source[-1] = source[-1][:-1]
    # Dirty rewrite: drop the redundant backslash continuations html2text
    # leaves around command substitutions, and move a leading '&& ' onto
    # the previous line, so the commands run cleanly in a bash cell.
    for i in range(0, len(source)):
        if source[i][-5:] == '$( \\\n':
            source[i] = source[i][0:-3] + '\n'
        if source[i] == ') \\\n':
            if i > 0 and source[i - 1][-3:] == ' \\\n':
                source[i - 1] = source[i - 1][0:-3] + '\n'
        if source[i][0:5] == '  && ':
            if i > 0 and source[i - 1][-3:] == ' \\\n':
                source[i - 1] = source[i - 1][0:-3] + ' && \n'
            source[i] = source[i][5:]
    return {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": source
    }
def convert(url, cells, element):
    """Walk the HTML tree and append a notebook cell per block element."""
    for child in list(element):
        if child.tag == 'div':
            convert(url, cells, child)
        elif child.tag == 'ul':
            # Link lists: rewrite .html hrefs to .ipynb and queue the
            # original URLs for crawling.
            for item in list(child):
                href = None
                for e in list(item):
                    if e.tag == 'a':
                        href = e.get('href')
                        u = urlparse(href)
                        if u.scheme and u.netloc:
                            # Strip query and fragment (u.path keeps its leading '/').
                            href = '{}://{}{}'.format(u.scheme, u.netloc, u.path)
                        else:
                            href = u.path
                        if href[-5:] == '.html':
                            file = href[:-5] + '.ipynb'
                            crawl_url[file] = urljoin(url, href)
                            href = file
                        e.set('href', href)
                content = lxml.html.tostring(item).decode().rstrip()
                if href in crawl_url:
                    # Append a '*' link back to the original page.
                    if content[-5:] == '</li>':
                        content = content[:-5] + '<a href="{}">*</a>'.format(crawl_url[href]) + '</li>'
                    else:
                        content += '<a href="{}">*</a>'.format(crawl_url[href])
                source = html2text.html2text(content).splitlines(keepends=True)
                cell = markdown(source)
                if cell:
                    cells.append(cell)
        elif child.tag == 'pre':
            # <pre> blocks become code cells; drop html2text's 4-space indent.
            content = lxml.html.tostring(child)
            source = []
            for item in html2text.html2text(content.decode()).splitlines(keepends=True):
                if item[0:4] == '    ': item = item[4:]
                source.append(item)
            cell = code(source)
            if cell:
                cells.append(cell)
        elif child.tag == 'blockquote':
            for qc in list(child):
                convert(url, cells, qc)
        else:
            # Headings, paragraphs, etc. become markdown cells.
            content = lxml.html.tostring(child)
            source = html2text.html2text(content.decode()).splitlines(keepends=True)
            # Strip the pilcrow that Sphinx appends to heading permalinks.
            if source[0][-2:] == '¶\n':
                source[0] = source[0][:-2] + '\n'
            cell = markdown(source)
            if cell:
                cells.append(cell)
def crawl(url, file):
    """Fetch one page, write it out as a notebook, then crawl linked pages."""
    if visit.get(url): return
    print('crawl({}, {})'.format(url, file))
    visit[url] = True
    page = urllib.request.urlopen(url).read()
    tree = lxml.html.fromstring(page)
    # xpath() returns a list; convert() iterates it like an element's children.
    body = tree.xpath("//div[@id='body']")
    cells = []
    convert(url, cells, body)
    # Demote the code cell that follows a "結果(例):" ("result (example):")
    # label to markdown, so expected output is not executed as a command.
    for i, cell in enumerate(cells):
        if cell["cell_type"] == "markdown" and \
           cell["source"][0] == "結果(例):" and \
           i + 1 < len(cells) and \
           cells[i + 1]["cell_type"] == "code":
            cells[i + 1] = markdown(cells[i + 1]["source"])
    ipynb['cells'] = cells
    with open(file, "w") as fh:
        fh.write(json.dumps(ipynb, sort_keys=True, ensure_ascii=False, indent=1))
    # Iterate over a snapshot: convert() grows crawl_url while we recurse.
    for file, url in list(crawl_url.items()):
        crawl(url, file)
if __name__ == '__main__':
    for url in sys.argv[1:]:
        print('get {}'.format(url))
        # An absolute local path is accepted as a file:// URL.
        if url[0] == '/':
            url = 'file://' + url
        # Name the notebook after the page: .../foo.html -> foo.ipynb
        file = urlparse(url).path.rpartition('/')[-1].rpartition('.')[0] + '.ipynb'
        crawl(url, file)
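A quick sanity check, as a sketch (the filename prepare.ipynb is a hypothetical output name): the generated file should parse as plain nbformat 4 JSON carrying the bash kernelspec from the skeleton above.

    import json

    with open('prepare.ipynb') as fh:
        nb = json.load(fh)
    assert nb['nbformat'] == 4
    assert nb['metadata']['kernelspec']['name'] == 'bash'
    print('{} cells'.format(len(nb['cells'])))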