Created
April 18, 2018 03:22
-
-
Save onlurking/2d66590e3a78d471c4cb39cff9cfaca2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from requests import get, utils | |
from bs4 import BeautifulSoup | |
from tomd import Tomd | |
from collections import namedtuple | |
from execjs import eval, compile | |
from time import sleep | |
# Pre-compile a small Node.js helper (via PyExecJS's `compile`, imported above)
# that converts an HTML string to Markdown with the `turndown` npm package.
# NOTE(review): this requires Node plus a resolvable `turndown` module at
# runtime -- confirm the environment provides them.
ctx = compile("""
function markdown(source) {
    var TurndownService = require('turndown');
    var turndownService = new TurndownService()
    var markdown = turndownService.turndown(source)
    return markdown
}
""")

# Start from requests' default headers and spoof a desktop Firefox UA so the
# SEP server treats the scraper like an ordinary browser.
headers = utils.default_headers()
headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'})
def get_toc(url):
    """Fetch the SEP table-of-contents page and return absolute entry URLs.

    Parameters
    ----------
    url : str
        URL of the contents page, e.g. ``https://plato.stanford.edu/contents.html``.

    Returns
    -------
    list[str]
        Absolute URLs of every anchor whose href contains ``entries/``.
        May contain duplicates; the caller de-duplicates with ``set()``.
    """
    html = get(url, headers=headers).content
    soup = BeautifulSoup(html, "html5lib")
    # Use https to match the scheme the contents page is fetched over; the
    # previous "http://" base produced mixed-scheme links and cost an extra
    # redirect on every article fetch.
    base = "https://plato.stanford.edu/"
    content = soup.find('div', {'id': 'content'})
    return [base + link['href']
            for link in content.find_all('a')
            if link.has_attr('href') and 'entries/' in link['href']]
def process_page(url):
    """Download one SEP entry and package it as an ``Article`` namedtuple.

    Fields
    ------
    title    : text of the entry's ``<h1>``.
    markdown : the article div's HTML converted via the Node `turndown` helper.
    author   : second non-empty line of the ``article-copyright`` div's text.
    pubdate  : text of the ``pubinfo`` div.
    """
    Article = namedtuple('Article', 'title markdown author pubdate')
    page = BeautifulSoup(get(url, headers=headers).content, "html5lib")
    body = page.find('div', {'id': 'article'})

    # Non-empty lines of the copyright block; index 1 holds the author name.
    copyright_text = page.find('div', {'id': 'article-copyright'}).text.strip().rstrip()
    authors = [line for line in copyright_text.split('\n') if len(line) > 0]

    return Article(
        title=body.find('h1').text,
        markdown=ctx.call('markdown', str(body)),
        author=authors[1],
        pubdate=page.find('div', {'id': 'pubinfo'}).text,
    )
def write_article(article):
    """Write an article's Markdown to ``<title>.md`` in the current directory.

    Parameters
    ----------
    article : Article
        Any object with ``title`` and ``markdown`` (str) attributes.

    NOTE(review): a title containing a path separator (e.g. '/') would make
    ``open`` fail or write outside the cwd -- confirm whether SEP titles can
    contain one and sanitize if so.
    """
    # `with` guarantees the handle is closed even if write() raises (the
    # original leaked it on error), and an explicit encoding makes the output
    # independent of the platform default.  Writing the whole string at once
    # replaces the original loop, which iterated the str character by
    # character.
    with open("{}.md".format(article.title), 'w', encoding='utf-8') as fp:
        fp.write(article.markdown)
# Scrape every SEP entry linked from the table of contents and save each one
# as a local Markdown file.
print("Generating TOC links")
# set() removes duplicate hrefs (an entry can be linked more than once).
toc_links = list(set(get_toc("https://plato.stanford.edu/contents.html")))
print("Found {} articles".format(len(toc_links)))
for num, article_url in enumerate(toc_links):
    # Progress line so an interrupted run shows which article it died on.
    print("[{}/{}] {}".format(num + 1, len(toc_links), article_url))
    write_article(process_page(article_url))
    # `sleep` was imported but never used: throttle requests so the scraper
    # does not hammer the SEP server with back-to-back fetches.
    sleep(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment