Skip to content

Instantly share code, notes, and snippets.

@z2015
Created May 2, 2016 02:08
Show Gist options
  • Save z2015/015dfd81328c0b32bdadd96772e536d5 to your computer and use it in GitHub Desktop.
两个pyspider抓取代码 (two pyspider crawler scripts)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-12-17 07:58:28
# Project: lyrce
from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    """pyspider crawler for job-posting pages under www.lyrce.net/qiye/.

    Walks the paginated listing, follows every link ending in
    ``zhaopin.html``, and returns the URL and <title> of each detail page.
    """

    crawl_config = {
    }

    @every(minutes=24 * 60)  # re-seed the crawl once per day
    def on_start(self):
        """Seed the crawl with the first listing page."""
        self.crawl('http://www.lyrce.net/qiye/page1.html', callback=self.index_page)

    def index_page(self, response):
        """Queue the next listing page (until the last one) and all detail links."""
        last_page_href = response.doc('#pagelist a:last').attr.href
        current_page_text = response.doc('.pagenow').text()
        next_page_number = int(current_page_text) + 1
        current_page_href = response.doc('.pagenow').attr.href
        # Keep paginating while the "last page" link differs from the
        # current-page marker's href.
        if last_page_href != current_page_href:
            self.crawl(
                'http://www.lyrce.net/qiye/page{}.html'.format(next_page_number),
                callback=self.index_page,
            )
        # Every anchor whose href ends in "zhaopin.html" is a job-detail page.
        for link in response.doc('a[href$="zhaopin.html"]').items():
            self.crawl(link.attr.href, callback=self.detail_page)

    @config(priority=2, age=10 * 24 * 60 * 60)  # re-crawl each detail page every 10 days
    def detail_page(self, response):
        """Return the URL and <title> text of one detail page."""
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-12-17 10:14:09
# Project: lyrt2
from pyspider.libs.base_handler import *
import json
from pyspider.libs.utils import md5string
class Handler(BaseHandler):
    """pyspider crawler for announcement listings on www.lyrt.com.cn.

    The site is an ASP.NET WebForms page, so "next page" is a POST
    postback rather than a plain link; each listing page's ``.a01``
    anchors lead to detail pages that are scraped field-by-field.
    """

    crawl_config = {
    }

    def get_taskid(self, task):
        """Build the task id from the URL plus the POST form data.

        pyspider's default task id is a hash of the URL alone, which would
        dedupe every postback to Browse.aspx into one task; mixing in the
        POST body makes each pagination request distinct.
        """
        return md5string(task['url'] + json.dumps(task['fetch'].get('data', '')))

    @every(minutes=6 * 60)  # re-seed the crawl every six hours
    def on_start(self):
        """Seed the crawl with the listing page."""
        self.crawl('http://www.lyrt.com.cn/LyrtBusiness/Browse.aspx',
                   callback=self.index_page)

    @config(age=24 * 10 * 60 * 60)  # treat a fetched listing page as fresh for 10 days
    def index_page(self, response):
        """Queue all detail links on this page, then POST the "next page" postback.

        Bug fix: the detail-link loop used to live inside the pagination
        guard, so the final listing page's links were never queued. The
        loop now runs unconditionally; only the postback is guarded.
        """
        # Queue every detail page on the current listing page.
        for link in response.doc('.a01').items():
            self.crawl(link.attr.href, callback=self.detail_page)
        # Paginate only while the WebForms "Next" button is still enabled.
        if response.doc('#ctl00_ContentPlaceHolder1_rptMsg_ctl21_lbtnNext').attr('disabled') != 'disabled':
            # Replicate the postback the Next link would trigger, echoing the
            # server-issued VIEWSTATE / EVENTVALIDATION tokens back verbatim.
            # NOTE(review): the form key says "ctl05" while the selector reads
            # "ctl21" — presumably matching the live form; verify if paging breaks.
            post_body = {
                "__EVENTTARGET": "ctl00$ContentPlaceHolder1$rptMsg$ctl21$lbtnNext",
                "__VIEWSTATE": response.doc('#__VIEWSTATE').attr.value,
                "__EVENTVALIDATION": response.doc("#__EVENTVALIDATION").attr.value,
                "ctl00$ContentPlaceHolder1$rptMsg$ctl05$ddlPage":
                    response.doc('select#ctl00_ContentPlaceHolder1_rptMsg_ctl21_ddlPage option:selected').text(),
            }
            self.crawl('http://www.lyrt.com.cn/LyrtBusiness/Browse.aspx',
                       data=post_body, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Scrape the fields of one announcement detail page."""
        return {
            "subject": response.doc('#business_content h4 span').text(),
            "publish_time": response.doc('#business_content div:eq(1) p').text(),
            "content": response.doc('#business_content .h5').text(),
            "contactway": response.doc('#business_content div:eq(4) p').text(),
            "pic": response.doc('#pic img').attr('src'),
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment