Skip to content

Instantly share code, notes, and snippets.

@mebusw
Created May 15, 2017 13:02
Show Gist options
  • Save mebusw/a679e853eb1f8ef72ecc67af48d31bea to your computer and use it in GitHub Desktop.
Save mebusw/a679e853eb1f8ef72ecc67af48d31bea to your computer and use it in GitHub Desktop.
import scrapy
class CsxCourseSpider(scrapy.Spider):
    """Scrape the scrumalliance.org course listing and follow its pagination.

    Each listing row that carries enough cells is emitted as a dict with the
    keys ``name``, ``trainer``, ``location`` and ``date``.
    """

    name = "csx"
    # Listing query: 50 rows per page, type=Csd, ordered by StartDate ascending.
    # NOTE(review): the start date is hard-coded to 5/15/2017 in the query
    # string -- confirm whether it should be refreshed before each crawl.
    start_urls = ['https://www.scrumalliance.org/courses-events/course.aspx?pageCount=50&country=&state=&city=&zip=&type=Csd;&trainer=&language=&startdate=5/15/2017%2012:00:00%20AM&enddate=1/1/1900%2012:00:00%20AM&discount=False&page=1&orderby=StartDate&sortdir=asc&radius=0&view=map']
    # Example of a course URL on the same site:
    # https://www.scrumalliance.org/courses-events/courses/csd/us/ohio/columbus/2017/may/201702145-csd

    def parse(self, response):
        """Yield one item per course row, then a request per pagination link.

        Args:
            response: The downloaded listing page.

        Yields:
            dict: ``name``, ``trainer``, ``location`` and ``date`` for every
                table row that has more than five text cells.
            scrapy.Request: One request per ``UnselectedPage`` link, handled
                by this same callback.
        """
        for row in response.xpath('*//tr'):
            cells = row.css('td::text').extract()
            # Indices 3-5 are read below, so rows with fewer than six text
            # cells are skipped (presumably header or spacer rows -- verify
            # against the live markup).
            if len(cells) <= 5:
                continue

            title = row.css('td > a::text').extract_first()
            yield {
                # Stripped like the other fields; stays None when the row
                # has no link text at all.
                'name': title.strip() if title is not None else None,
                'trainer': cells[3].strip(),
                'location': cells[4].strip(),
                'date': cells[5].strip(),
            }

        # Selecting @href directly skips anchors that have no href attribute,
        # so urljoin() is never handed None (which would resolve back to the
        # current page URL).
        for href in response.xpath('//a[@class="UnselectedPage"]/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment