JamesPHoughton · October 21, 2015 19:00
diff --git a/scrape_demo.py b/scrape_demo.py
 """ Scraping with xpath
 Here's a basic demo on how I scraped a page using xpath to get specific pieces of info.
 While I've only accessed a few single pieces of info on the page, xpath can also 
 get you a list of all the elements that meet a specific path, and will return them as
 a list or iterable.

 A tool you might like for debugging your xpath expressions is: xpath-helper
 https://chrome.google.com/webstore/detail/xpath-helper/hgimnogjllphhhkhlmebbmlgjoejdpjl
 """

 import os

 import wget
 import lxml.html as lh

 # Report page to scrape.
 url = 'http://xforce.iss.net/xforce/xfdb/99616'

 # I find it helpful to download the page before scraping, because
 # when you're working out the scraper details, you need to load the
 # page a lot of times.
 filename = 'download_page.html'

 # Fetch only when no local copy exists yet. Re-downloading on every run
 # defeats the caching described above, and the `wget` package does not
 # overwrite an existing file (it saves under a new " (1)"-suffixed name),
 # so the parse below would keep reading the first download anyway.
 if not os.path.exists(filename):
     wget.download(url=url, out=filename)

 doc = lh.parse(filename)

 # Leading text of the first <ul> that follows the "Credit" heading.
 acknowledgements_str = doc.xpath("""//*[@id="main-content"]
                                    /h4[text()[contains(.,"Credit")]]
                                    /following-sibling::ul""")[0].text

 # Strip each line *before* dropping empties, so whitespace-only lines do
 # not survive as '' entries in the result.
 acknowledgements = [s.strip() for s in acknowledgements_str.splitlines()
                     if s.strip()]

 # Text nodes directly under the first <ul> that follows the
 # "Disclosure Timeline" heading.
 timeline_strs = doc.xpath("""//*[@id="main-content"]
                            /h4[text()[contains(.,"Disclosure Timeline")]]
                            /following-sibling::ul[1]
                            /text()""")

 # A list comprehension (rather than filter()) gives a real, reusable list
 # on both Python 2 and Python 3.
 timeline = [s.strip() for s in timeline_strs if s.strip()]

 # Text content of the first link in the first <ul> child of main-content.
 cve = doc.xpath('//*[@id="main-content"]/ul[1]/a')[0].text_content()
	""" Scraping with xpath
	Here's a basic demo on how I scraped a page using xpath to get specific pieces of info.
	While I've only accessed a few single pieces of info on the page, xpath can also
	get you a list of all the elements that meet a specific path, and will return them as
	a list or iterable.

	A tool you might like for debugging your xpath expressions is: xpath-helper
	https://chrome.google.com/webstore/detail/xpath-helper/hgimnogjllphhhkhlmebbmlgjoejdpjl
	"""

	import os

	import wget
	import lxml.html as lh

	# Report page to scrape.
	url = 'http://xforce.iss.net/xforce/xfdb/99616'

	# I find it helpful to download the page before scraping, because
	# when you're working out the scraper details, you need to load the
	# page a lot of times.
	filename = 'download_page.html'

	# Fetch only when no local copy exists yet. Re-downloading on every run
	# defeats the caching described above, and the `wget` package does not
	# overwrite an existing file (it saves under a new " (1)"-suffixed name),
	# so the parse below would keep reading the first download anyway.
	if not os.path.exists(filename):
		wget.download(url=url, out=filename)

	doc = lh.parse(filename)

	# Leading text of the first <ul> that follows the "Credit" heading.
	acknowledgements_str = doc.xpath("""//*[@id="main-content"]
	/h4[text()[contains(.,"Credit")]]
	/following-sibling::ul""")[0].text

	# Strip each line *before* dropping empties, so whitespace-only lines do
	# not survive as '' entries in the result.
	acknowledgements = [s.strip() for s in acknowledgements_str.splitlines()
						if s.strip()]

	# Text nodes directly under the first <ul> that follows the
	# "Disclosure Timeline" heading.
	timeline_strs = doc.xpath("""//*[@id="main-content"]
	/h4[text()[contains(.,"Disclosure Timeline")]]
	/following-sibling::ul[1]
	/text()""")

	# A list comprehension (rather than filter()) gives a real, reusable list
	# on both Python 2 and Python 3.
	timeline = [s.strip() for s in timeline_strs if s.strip()]

	# Text content of the first link in the first <ul> child of main-content.
	cve = doc.xpath('//*[@id="main-content"]/ul[1]/a')[0].text_content()
No results found