Created
February 17, 2013 00:48
-
-
Save cheekybastard/4969478 to your computer and use it in GitHub Desktop.
craigslist_scrapy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.contrib.spiders import CrawlSpider, Rule | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from scrapy.selector import HtmlXPathSelector | |
from myspider.items import CraigslistSampleItem | |
class MySpider(CrawlSpider):
    """Crawl sfbay.craigslist.org and collect link titles and hrefs.

    Two crawl rules drive the spider:

    * links matching ``index\\d+\\.html`` are followed and handed to
      :meth:`parse_items_2`;
    * links matching ``/npo`` are handed to :meth:`parse_items_1`
      (not followed further).

    Both callbacks return a list of ``CraigslistSampleItem`` objects whose
    ``title`` and ``link`` fields hold the lists produced by ``extract()``.
    """

    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/"]

    # Raw strings keep the regex backslashes literal; the pattern values are
    # the same as before.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r"index\d+\.html",)),
            callback="parse_items_2",
            follow=True,
        ),
        Rule(
            SgmlLinkExtractor(allow=(r"\/npo",)),
            callback="parse_items_1",
        ),
    )

    def parse_items_1(self, response):
        """Build one item per ``<div>`` from the ``li/a`` links inside it.

        Args:
            response: the response passed in by the crawl rule.

        Returns:
            A list with one ``CraigslistSampleItem`` per ``<div>`` node.
        """
        print(response.url)
        items = []
        for node in HtmlXPathSelector(response).select("//div"):
            # A fresh item per node: reusing a single instance would leave
            # the list full of references to the same (last-written) item.
            item = CraigslistSampleItem()
            # The leading "." keeps the query relative to this <div>; a bare
            # "//li/..." would search the whole document on every iteration.
            item["title"] = node.select(".//li/a/text()").extract()
            item["link"] = node.select(".//li/a/@href").extract()
            print("%s %s" % (item["title"], item["link"]))
            items.append(item)
        return items

    def parse_items_2(self, response):
        """Build one item per ``<p>`` from the ``<a>`` children of that node.

        Args:
            response: the response passed in by the crawl rule.

        Returns:
            A list with one ``CraigslistSampleItem`` per ``<p>`` node.
        """
        print(response.url)
        items = []
        for node in HtmlXPathSelector(response).select("//p"):
            # A fresh item per node so entries do not alias each other.
            item = CraigslistSampleItem()
            item["title"] = node.select("a/text()").extract()
            item["link"] = node.select("a/@href").extract()
            print(item["title"])
            items.append(item)
        return items
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment