Skip to content

Instantly share code, notes, and snippets.

@syshack
Last active August 29, 2015 14:00
Show Gist options
  • Save syshack/7791c6ed7f2d39632751 to your computer and use it in GitHub Desktop.
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from jdspider.items import ResItem
class CheckResSpider(CrawlSpider):
    """Crawl a site and report <script src=...> resources that fail to load.

    Command-line arguments (passed with ``scrapy crawl check_res -a ...``):
        url   -- start URL for the crawl.
        check -- comma-separated list of values; stored split on ``self.check``.
    """
    name = 'check_res'

    def __init__(self, url=None, check=None, *args, **kwargs):
        # Rules MUST be assigned before CrawlSpider.__init__ runs:
        # CrawlSpider compiles self.rules inside its __init__
        # (_compile_rules), so rules assigned afterwards are silently
        # ignored and the spider extracts no links.
        self.rules = (
            Rule(SgmlLinkExtractor(tags=('script',), attrs=('src',)),
                 callback='check_item', follow=True),
        )
        super(CheckResSpider, self).__init__(*args, **kwargs)
        print(check)
        # Guard against a missing -a check=... argument instead of
        # crashing with AttributeError on None.split(',').
        self.check = check.split(',') if check else []
        # Avoid start_urls == [None] when no url argument is supplied.
        self.start_urls = [url] if url else []
        print(self.start_urls)

    def check_item(self, response):
        """Return a ResItem for a failed (HTTP >= 400) resource response.

        The item records the failing resource URL in ``res`` and the
        referring page (when available) in ``url``; successful responses
        return None so nothing is collected for them.
        """
        print(response.url)
        if response.status < 400:
            return None
        item = ResItem()
        item["res"] = response.url
        # The Referer header names the page that embedded the broken
        # resource; it can be absent (e.g. on the initial request).
        try:
            item["url"] = response.request.headers["Referer"]
        except KeyError:
            item["url"] = ""
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment