Skip to content

Instantly share code, notes, and snippets.

@582033
Created August 9, 2016 09:33
Show Gist options
  • Save 582033/2cf98304e9cd11c38243cf4cf9e066c8 to your computer and use it in GitHub Desktop.
Save 582033/2cf98304e9cd11c38243cf4cf9e066c8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spider import Spider#, Rule
from itjuzi.items import ItjuziItem
from itjuzi.settings import *
class spider(Spider):
    """Crawl the itjuzi.com company listing and emit one item per company name.

    Crawling starts at the first listing page; ``parse`` then follows the
    pagination link(s) it finds on every page it processes.
    """

    name = "itjuzi"
    # Scrapy's offsite filtering reads ``allowed_domains``. The attribute was
    # previously spelled ``allow_domains``, which Scrapy ignores.
    allowed_domains = [
        "www.itjuzi.com",
    ]
    # Misspelled name kept as an alias in case other code still reads it.
    allow_domains = allowed_domains
    start_urls = [
        "http://www.itjuzi.com/company?page=1",
    ]

    def parse(self, response):
        """Extract company names from a listing page and follow pagination.

        Args:
            response: the downloaded listing page.

        Yields:
            One ``ItjuziItem`` per company title found on the page (with
            ``company_name`` set), then one ``scrapy.Request`` for every
            pagination anchor whose text matches the "next page" pattern.
        """
        # NOTE(review): the source's indentation was lost, so it is assumed
        # that the whole body was guarded by the 200 check -- confirm.
        if response.status != 200:
            return

        title_xpath = '//p[@class="title"]/a/span/text()'
        for company_name in response.xpath(title_xpath).extract():
            item = ItjuziItem()
            item['company_name'] = company_name
            yield item

        pagination_links = response.xpath(
            '//div[contains(@class, "ui-pagechange")]/a'
        )
        for link in pagination_links:
            if not link.xpath('text()').re(u'.*下一页.*'):
                continue
            # An anchor without an href would make ``extract()[0]`` raise
            # IndexError, so check before indexing.
            hrefs = link.xpath('@href').extract()
            if not hrefs or not hrefs[0]:
                continue
            # hrefs may be relative; Request requires an absolute URL.
            next_page = response.urljoin(hrefs[0])
            print("next_page: %s" % next_page)
            # The callback must be the bound method itself, not its name as
            # a string, otherwise the next page is never parsed.
            yield scrapy.Request(next_page, callback=self.parse)
@582033
Copy link
Author

582033 commented Aug 10, 2016

找到错误了,callback写错了

应该是 yield scrapy.Request(next_page, callback=self.parse)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment