Two pyspider scraping scripts: the first paginates job listings on www.lyrce.net via numbered URLs, the second pages through ASP.NET postbacks on www.lyrt.com.cn.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-12-17 07:58:28
# Project: lyrce

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)  # re-run the entry point once a day
    def on_start(self):
        self.crawl('http://www.lyrce.net/qiye/page1.html', callback=self.index_page)

    def index_page(self, response):
        # Follow the numbered pagination until the current page is the last one.
        pageLastHref = response.doc('#pagelist a:last').attr.href
        pageNowText = response.doc('.pagenow').text()
        pageNextText = int(pageNowText) + 1
        pageNowHref = response.doc('.pagenow').attr.href
        if pageLastHref != pageNowHref:
            self.crawl('http://www.lyrce.net/qiye/page{}.html'.format(pageNextText),
                       callback=self.index_page)
        # Queue every job-posting link (URLs ending in "zhaopin.html") on this page.
        for each in response.doc('a[href$="zhaopin.html"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2, age=10 * 24 * 60 * 60)  # re-crawl detail pages every 10 days
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-12-17 10:14:09
# Project: lyrt2

import json

from pyspider.libs.base_handler import *
from pyspider.libs.utils import md5string


class Handler(BaseHandler):
    crawl_config = {
    }

    def get_taskid(self, task):
        # Include the POST body in the taskid so each paginated postback to
        # Browse.aspx is treated as a distinct task instead of being deduplicated.
        return md5string(task['url'] + json.dumps(task['fetch'].get('data', '')))

    @every(minutes=6 * 60)
    def on_start(self):
        self.crawl('http://www.lyrt.com.cn/LyrtBusiness/Browse.aspx', callback=self.index_page)

    @config(age=24 * 10 * 60 * 60)
    def index_page(self, response):
        # The "next page" link is an ASP.NET postback; stop when it is disabled.
        if response.doc('#ctl00_ContentPlaceHolder1_rptMsg_ctl21_lbtnNext').attr('disabled') != 'disabled':
            post_body = {
                "__EVENTTARGET": "ctl00$ContentPlaceHolder1$rptMsg$ctl21$lbtnNext",
                "__VIEWSTATE": response.doc('#__VIEWSTATE').attr.value,
                "__EVENTVALIDATION": response.doc("#__EVENTVALIDATION").attr.value,
                "ctl00$ContentPlaceHolder1$rptMsg$ctl05$ddlPage": response.doc(
                    'select#ctl00_ContentPlaceHolder1_rptMsg_ctl21_ddlPage option:selected').text()
            }
            # Queue the detail pages listed on the current page.
            for each in response.doc('.a01').items():
                self.crawl(each.attr.href, callback=self.detail_page)
            # Re-post the form to fetch the next page of listings.
            self.crawl('http://www.lyrt.com.cn/LyrtBusiness/Browse.aspx',
                       data=post_body, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        return {
            "subject": response.doc('#business_content h4 span').text(),
            "publish_time": response.doc('#business_content div:eq(1) p').text(),
            "content": response.doc('#business_content .h5').text(),
            "contactway": response.doc('#business_content div:eq(4) p').text(),
            "pic": response.doc('#pic img').attr('src'),
        }