@insin
Created December 12, 2011 21:35
Scrapy - submit form to set up cookies, then scrape known URLs...
from scrapy.http import FormRequest, Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider

BASE_URL = 'http://www.url.com/'

class ThingSpider(BaseSpider):
    name = 'thing'
    start_urls = ['%ssearchThing.php' % BASE_URL]

    def parse(self, response):
        """
        Simulates submission of the initial search form.
        """
        return [FormRequest.from_response(response,
                                          callback=self.after_initial_search)]

    def after_initial_search(self, response):
        """
        Determines if we got a page of results and, if so, yields requests for
        all the results pages.
        """
        if 'Search The Things' in response.body:
            raise ValueError('We appear to still be on the search form after submitting it.')
        for url in ['%ssearchThing.php?searching=true&pageNum=%s' % (BASE_URL, p * 20)
                    for p in range(71)]:
            yield Request(url, self.parse_search_results)
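    # The loop above passes self.parse_search_results as its callback, but the
    # gist does not define that method (the snippet is truncated). Below is a
    # minimal sketch of such a callback, assuming a hypothetical
    # '//div[@class="result"]/a' structure on the results pages; adjust the
    # XPath and the extraction to match the real markup.
    def parse_search_results(self, response):
        """
        Placeholder: extract result links from a page of search results.
        """
        hxs = HtmlXPathSelector(response)
        for href in hxs.select('//div[@class="result"]/a/@href').extract():
            self.log('Found result link: %s' % href)

# Run from within a Scrapy project with: scrapy crawl thing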