dmgottlieb · June 17, 2016 15:30
diff --git a/SEPScraper.py b/SEPScraper.py
 # SEPScraper.py
 # Dave Gottlieb 
 #
 # Scrapes article text from 2016 Spring stable edition of SEP. 

 from lxml import html
 import requests
 import re

 URLstem = "http://plato.stanford.edu/archives/spr2016/"

 # Fetch the table-of-contents page and parse it so the entry links can be queried.
 contentspage = requests.get(URLstem + "contents.html")
 tree = html.fromstring(contentspage.content)

 # select article URLs from TOC page
 articleURLs = tree.xpath('//div[@id="content"]/ul/li/a/attribute::href')

 # remove duplicates (note: set() does not preserve the TOC order)
 articleURLs = list(set(articleURLs))


 # For each article, download the page and save it as entries/<name>.html.
 # NOTE: the entries/ directory must already exist; this script does not create it.
 for p in articleURLs:
 	url = URLstem + p
 	article = requests.get(url)
 	# Drop every "entries" substring and every slash from the href to get a
 	# flat file name (presumably hrefs look like "entries/<name>/" -- verify).
 	filename = re.sub(r'entries|/', '', p)
 	# article.content is the raw response bytes, so the file is opened in
 	# binary mode; the with-block closes it even if the write fails.
 	with open('entries/' + filename + '.html', 'wb') as outfile:
 		outfile.write(article.content)
 	#break # so loop only runs once on test
	# SEPScraper.py
	# Dave Gottlieb
	#
	# Scrapes article text from 2016 Spring stable edition of SEP.

	from lxml import html
	import requests
	import re

	URLstem = "http://plato.stanford.edu/archives/spr2016/"

	# Fetch the table-of-contents page and parse it so the entry links can be queried.
	contentspage = requests.get(URLstem + "contents.html")
	tree = html.fromstring(contentspage.content)

	# select article URLs from TOC page
	articleURLs = tree.xpath('//div[@id="content"]/ul/li/a/attribute::href')

	# remove duplicates (note: set() does not preserve the TOC order)
	articleURLs = list(set(articleURLs))


	# For each article, download the page and save it as entries/<name>.html.
	# NOTE: the entries/ directory must already exist; this script does not create it.
	for p in articleURLs:
		url = URLstem + p
		article = requests.get(url)
		# Drop every "entries" substring and every slash from the href to get a
		# flat file name. The alternation bar must stay unescaped: an escaped
		# "\|" would match only a literal pipe character and strip nothing.
		filename = re.sub(r'entries|/', '', p)
		# article.content is the raw response bytes, so the file is opened in
		# binary mode; the with-block closes it even if the write fails.
		with open('entries/' + filename + '.html', 'wb') as outfile:
			outfile.write(article.content)
		#break # so loop only runs once on test
No results found