Skip to content

Instantly share code, notes, and snippets.

@onlurking
Created April 18, 2018 03:22
Show Gist options
  • Save onlurking/2d66590e3a78d471c4cb39cff9cfaca2 to your computer and use it in GitHub Desktop.
Save onlurking/2d66590e3a78d471c4cb39cff9cfaca2 to your computer and use it in GitHub Desktop.
from requests import get, utils
from bs4 import BeautifulSoup
from tomd import Tomd
from collections import namedtuple
from execjs import eval, compile
from time import sleep
# `compile` here is execjs.compile (imported above), not the Python builtin.
# The compiled context exposes one JavaScript function, `markdown(source)`,
# which loads the `turndown` package and returns
# `turndownService.turndown(source)`.
# NOTE(review): `require('turndown')` presumably needs a Node.js-style runtime
# with the turndown package resolvable from it -- confirm before deploying.
ctx = compile("""
function markdown(source) {
var TurndownService = require('turndown');
var turndownService = new TurndownService()
var markdown = turndownService.turndown(source)
return markdown
}
""")
# Start from requests' default headers and override only the User-Agent with a
# desktop Firefox 57 string. This mapping is passed as `headers=` to the get()
# calls in get_toc() and process_page().
headers = utils.default_headers()
headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'})
def get_toc(url, timeout=30):
    """Return the absolute URLs of all entry links on a table-of-contents page.

    The page at *url* is fetched with the module-level ``headers``, parsed
    with html5lib, and every ``<a>`` inside ``<div id="content">`` whose
    ``href`` contains ``'entries/'`` is collected. Duplicates are not removed.

    Args:
        url: Address of the contents page to scrape.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list of URL strings, in document order.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        ValueError: The page has no ``<div id="content">`` element.
    """
    from urllib.parse import urljoin

    response = get(url, headers=headers, timeout=timeout)
    # Fail on error pages instead of trying to scrape them.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html5lib")

    content = soup.find('div', {'id': 'content'})
    if content is None:
        raise ValueError('{}: no <div id="content"> element found'.format(url))

    # Kept as the original http:// base so relative hrefs resolve to exactly
    # the same strings as before. urljoin (rather than `base + href`) also
    # handles root-relative ("/entries/x/") and already-absolute hrefs.
    base = "http://plato.stanford.edu/"
    return [urljoin(base, anchor['href'])
            for anchor in content.find_all('a')
            if anchor.has_attr('href') and 'entries/' in anchor['href']]
# Result record for process_page(). Defined once at module level so every
# result shares the same type instead of a fresh class per call.
Article = namedtuple('Article', 'title markdown author pubdate')


def process_page(url, timeout=30):
    """Download one entry page and convert it into an ``Article`` record.

    Args:
        url: Address of the entry page.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        An ``Article`` namedtuple with:
          * ``title``    -- text of the first ``<h1>`` inside ``div#article``
          * ``markdown`` -- result of the JavaScript ``markdown`` function in
            ``ctx`` applied to the HTML of ``div#article``
          * ``author``   -- second non-empty line of the text of
            ``div#article-copyright``
          * ``pubdate``  -- text of ``div#pubinfo`` (not stripped)

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        ValueError: An expected element is missing, or the copyright block
            has fewer than two non-empty lines.
    """
    response = get(url, headers=headers, timeout=timeout)
    # Fail on error pages instead of trying to scrape them.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html5lib")

    body = soup.find('div', {'id': 'article'})
    heading = body.find('h1') if body is not None else None
    copyright_box = soup.find('div', {'id': 'article-copyright'})
    pubinfo = soup.find('div', {'id': 'pubinfo'})

    # Report every missing piece at once, with the URL, rather than letting
    # the first None dereference surface as an anonymous AttributeError.
    expected = (
        ('div#article', body),
        ('div#article h1', heading),
        ('div#article-copyright', copyright_box),
        ('div#pubinfo', pubinfo),
    )
    missing = [selector for selector, node in expected if node is None]
    if missing:
        raise ValueError("{}: missing expected element(s): {}".format(
            url, ", ".join(missing)))

    # Same selection rule as before: drop empty lines, take the second one.
    copyright_lines = [line
                       for line in copyright_box.text.strip().split('\n')
                       if line]
    if len(copyright_lines) < 2:
        raise ValueError(
            "{}: cannot find an author line in div#article-copyright".format(url))

    return Article(title=heading.text,
                   markdown=ctx.call('markdown', str(body)),
                   author=copyright_lines[1],
                   pubdate=pubinfo.text)
def write_article(article):
    """Write an article's Markdown to ``<title>.md`` in the current directory.

    Args:
        article: Object with a ``title`` string and a ``markdown`` attribute
            (a string, or any iterable of strings), e.g. the record returned
            by ``process_page``.

    Path separators in the title are replaced with ``-`` so that a title such
    as ``"Mind/Body"`` becomes ``Mind-Body.md`` instead of pointing into a
    non-existent directory. The file is always written as UTF-8, regardless
    of the platform's default encoding, and an existing file is overwritten.
    """
    import os

    filename = "{}.md".format(article.title)
    # os.altsep is None on POSIX; on Windows this covers both "\\" and "/".
    for separator in (os.sep, os.altsep):
        if separator:
            filename = filename.replace(separator, "-")

    # `with` closes the file even if a write fails.
    with open(filename, 'w', encoding='utf-8') as out:
        # writelines accepts a str or a list of lines and writes them as-is,
        # matching the original per-item loop without one write() per char.
        out.writelines(article.markdown)
# Script driver: collect every entry URL from the contents page, then download
# each entry and save it as a Markdown file in the current directory.
print("Generating TOC links")
# dict.fromkeys de-duplicates like set() did, but keeps the links in the order
# they appear on the contents page, so runs are reproducible.
toc_links = list(dict.fromkeys(get_toc("https://plato.stanford.edu/contents.html")))
print("Found {} articles".format(len(toc_links)))
for entry_url in toc_links:
    write_article(process_page(entry_url))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment