Skip to content

Instantly share code, notes, and snippets.

@cheekybastard
Created February 17, 2013 00:48
Show Gist options
  • Save cheekybastard/4969478 to your computer and use it in GitHub Desktop.
Save cheekybastard/4969478 to your computer and use it in GitHub Desktop.
craigslist_scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from myspider.items import CraigslistSampleItem
class MySpider(CrawlSpider):
    """Crawl sfbay.craigslist.org and collect link titles and hrefs.

    Two crawl rules are registered:

    * links matching ``index\\d+\\.html`` are handed to ``parse_items_2``
      and are themselves followed for further links;
    * links matching ``\\/npo`` are handed to ``parse_items_1``.

    Both callbacks return a list of ``CraigslistSampleItem`` objects whose
    ``title`` and ``link`` fields hold the lists returned by ``extract()``.
    """

    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/"]
    # Raw strings keep the regex escapes (\d, \., \/) literal; the pattern
    # values are unchanged from the original non-raw spellings.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r"index\d+\.html")),
            callback="parse_items_2",
            follow=True,
        ),
        Rule(SgmlLinkExtractor(allow=(r"\/npo")), callback="parse_items_1"),
    )

    def __init__(self, *a, **kw):
        """Forward all positional and keyword arguments to ``CrawlSpider``."""
        super(MySpider, self).__init__(*a, **kw)

    def _extract_items(self, response, node_xpath, title_xpath, link_xpath):
        """Build one item per node matched by ``node_xpath``.

        ``title_xpath`` and ``link_xpath`` are evaluated against each matched
        node, so they should be relative expressions.

        Returns a list of ``CraigslistSampleItem`` objects in document order.
        """
        items = []
        for node in HtmlXPathSelector(response).select(node_xpath):
            # A fresh item per node: reusing a single instance would leave
            # the list holding N references to the same (last) result.
            item = CraigslistSampleItem()
            item["title"] = node.select(title_xpath).extract()
            item["link"] = node.select(link_xpath).extract()
            items.append(item)
        return items

    def parse_items_1(self, response):
        """Return one item per ``<div>``, holding its ``li/a`` texts and hrefs.

        Prints the response URL, then the title and link lists of each item.
        """
        print(response.url)
        # The leading "." makes the XPaths relative to each <div>; a bare
        # "//li/a" would search the whole document on every iteration.
        items = self._extract_items(
            response, "//div", ".//li/a/text()", ".//li/a/@href"
        )
        for item in items:
            # Single-argument print() emits the same text under Python 2's
            # print statement and Python 3's print function.
            print("%s %s" % (item["title"], item["link"]))
        return items

    def parse_items_2(self, response):
        """Return one item per ``<p>``, holding its direct ``a`` texts and hrefs.

        Prints the response URL, then the title list of each item.
        """
        print(response.url)
        items = self._extract_items(response, "//p", "a/text()", "a/@href")
        for item in items:
            print(item["title"])
        return items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment