Created
June 17, 2016 15:30
-
-
Save dmgottlieb/32bede1a800b11eb676dc852021411a6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SEPScraper.py
# Dave Gottlieb
#
# Scrapes article text from 2016 Spring stable edition of SEP.
#
# Fetches the archive's contents page, collects the article links found
# in it, downloads each linked page and saves the raw response body as
# entries/<name>.html, relative to the current working directory.

import os
import re
import sys

from lxml import html
import requests

URLstem = "http://plato.stanford.edu/archives/spr2016/"
OUTPUT_DIR = "entries"

contentspage = requests.get(URLstem + "contents.html")
# Fail loudly on an HTTP error instead of parsing an error page and
# silently ending up with no article links.
contentspage.raise_for_status()
tree = html.fromstring(contentspage.content)

# select article URLs from TOC page
articleURLs = tree.xpath('//div[@id="content"]/ul/li/a/attribute::href')

# remove duplicates
articleURLs = list(set(articleURLs))

# open() does not create missing directories, so make sure the output
# directory exists before the first write.
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# For each article, download the text and save it
for p in articleURLs:
    url = URLstem + p
    article = requests.get(url)
    if not article.ok:
        # Skip a failed download rather than saving an error page as if
        # it were the article; keep going with the remaining links.
        sys.stderr.write("skipping %s (HTTP %d)\n" % (url, article.status_code))
        continue

    # Drop every "entries" substring and every slash from the href to get
    # a flat file name (presumably "entries/foo/" -> "foo"; the exact href
    # shape depends on the contents page).
    filename = re.sub('entries|/', '', p)

    # article.content is the raw response body (bytes), so write in binary
    # mode; the with-block closes the file even if the write fails.
    with open(os.path.join(OUTPUT_DIR, filename + '.html'), 'wb') as outfile:
        outfile.write(article.content)
    # break  # so loop only runs once on test
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment