# Gist by @onlurking, created April 17, 2018 03:08
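# Scrapes the Stanford Encyclopedia of Philosophy: reads the table of
# contents, converts each entry's HTML to Markdown with tomd, and writes
# it to a local .md file. Requires requests, beautifulsoup4, html5lib, tomd.
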
from requests import get
from bs4 import BeautifulSoup
from tomd import Tomd
from collections import namedtuple


def parse_page(url):
    """Fetch a page and return its parsed BeautifulSoup tree."""
    html = get(url).content
    soup = BeautifulSoup(html, "html5lib")
    return soup


def get_toc(url):
    """Collect absolute URLs for every entry linked from the table of contents."""
    base = "https://plato.stanford.edu/"
    links = [base + link['href']
             for link in parse_page(url)
             .find('div', {'id': 'content'})
             .find_all('a')
             if link.has_attr('href') and 'entries/' in link['href']]
    return links


def process_page(url):
    """Parse a single entry page into an Article namedtuple."""
    soup = parse_page(url)
    article = soup.find('div', {'id': 'article'})
    Article = namedtuple('Article', 'title markdown author pubdate')
    return Article(title=article.find('h1').text,
                   markdown=Tomd(str(article)).markdown,
                   author=[info for info in soup
                           .find('div', {'id': 'article-copyright'})
                           .text.strip()
                           .split('\n') if len(info) > 0][1],
                   pubdate=soup.find('div', {'id': 'pubinfo'}).text)


def write_article(article):
    """Write the converted Markdown to '<title>.md'."""
    with open("{}.md".format(article.title), 'w') as file:
        file.write(article.markdown)


toc_links = get_toc("https://plato.stanford.edu/contents.html")

# Lazy pipeline: each next() call fetches, converts, and writes one entry.
# next(work) below processes only the first entry in the table of contents.
work = (write_article(process_page(page)) for page in toc_links)
next(work)
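
# To convert every entry rather than just the first, exhaust the generator
# (note: this fetches every article in the encyclopedia):
#
#     for _ in work:
#         pass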