Skip to content

Instantly share code, notes, and snippets.

@dmgottlieb
Created June 17, 2016 15:30
Show Gist options
  • Select an option

  • Save dmgottlieb/32bede1a800b11eb676dc852021411a6 to your computer and use it in GitHub Desktop.

Select an option

Save dmgottlieb/32bede1a800b11eb676dc852021411a6 to your computer and use it in GitHub Desktop.
# SEPScraper.py
# Dave Gottlieb
#
# Scrapes the raw HTML of every article in the Spring 2016 archived edition
# of the Stanford Encyclopedia of Philosophy (SEP), one file per article.
import os
import re

import requests
from lxml import html
# Base URL of the Spring 2016 archive; hrefs taken from the table of contents
# are appended to it to form the full article URLs.
URLstem = "http://plato.stanford.edu/archives/spr2016/"
# Directory that receives one .html file per downloaded article.
outputdir = 'entries'

contentspage = requests.get(URLstem + "contents.html")
# Fail fast: without a valid table of contents there is nothing to scrape.
contentspage.raise_for_status()
tree = html.fromstring(contentspage.content)

# select article URLs from TOC page
articleURLs = tree.xpath('//div[@id="content"]/ul/li/a/attribute::href')
# remove duplicates; sort so every run visits the articles in the same order
articleURLs = sorted(set(articleURLs))

# Create the output directory on first run instead of assuming it exists.
if not os.path.isdir(outputdir):
    os.makedirs(outputdir)

# For each article, download the page and save it
for p in articleURLs:
    url = URLstem + p
    article = requests.get(url)
    # Do not store an HTTP error page as if it were the article.
    article.raise_for_status()
    # Strip "entries" and every "/" from the href to get a flat file name,
    # e.g. an href of the form "entries/abduction/" becomes "abduction".
    filename = re.sub('entries|/', '', p)
    # article.content is bytes, so the file must be opened in binary mode
    # (text mode raises TypeError on Python 3). The with-block closes the
    # file even if the write fails.
    with open(os.path.join(outputdir, filename + '.html'), 'wb') as outfile:
        outfile.write(article.content)
    # break  # uncomment so the loop only runs once when testing
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment