-
-
Save zhiyue/4217ae64a817a51e956d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- encoding: utf-8 -*- | |
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: | |
# Created on 2014-09-10 12:05:42 | |
import os | |
import re | |
import json | |
import datetime | |
from libs.pprint import pprint | |
from libs.base_handler import * | |
class Handler(BaseHandler):
    '''
    Crawl event listings from douban.com location pages.

    Flow: on_start -> location_page (city discovery) -> index_page
    (event list + pagination) -> detail_page (one result dict per event).
    '''

    # Selector for event title links on a listing page. It is used both to
    # recognise a listing page and to enumerate its events.
    _EVENT_LINKS = 'HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-free.clearfix>DIV.article>DIV#db-events-list>UL.events-list>LI.list-entry>DIV.info>DIV.title>A'

    def on_start(self):
        '''Seed the crawl with the China-wide location page.'''
        self.crawl('http://www.douban.com/location/china/', callback=self.location_page)

    @config(age=60)
    def location_page(self, response):
        '''
        Handle a location page.

        If the page already contains event links it is treated as a listing
        page and delegated to index_page. Otherwise every city link is turned
        into that city's "week-all" events URL and crawled with this same
        callback.
        '''
        if response.doc(self._EVENT_LINKS):
            return self.index_page(response)
        for each in response.doc('DIV.location>A').items():
            href = each.attr.href
            if not href:
                # Anchor without an href: nothing to derive a city from.
                continue
            if 'douban.com/location/' in href:
                # .../location/<city>/ -> second-to-last path segment.
                city = href.split('/')[-2]
            else:
                # Presumably http://<city>.douban.com/...: take the text
                # before the first dot and drop the 7-character "http://".
                city = href.split('.')[0][7:]
            self.crawl('http://www.douban.com/location/%s/events/week-all' % city,
                       callback=self.location_page)

    @config(age=24*60*60)
    def index_page(self, response):
        '''Queue every event detail page and every paginator link.'''
        for each in response.doc(self._EVENT_LINKS).items():
            self.crawl(each.attr.href, callback=self.detail_page)
        for each in response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-free.clearfix>DIV.article>DIV#db-events-list>DIV.paginator>A').items():
            self.crawl(each.attr.href, callback=self.index_page)

    def _event_times(self, doc):
        '''Return the list of time strings for an event page (may be empty).'''
        stages = [x.text() for x in doc("DIV.article>DIV.related_info DIV.buy-tickets-bd-con>DIV.buy-tickets-bd>DIV.buy-tickets-item>DIV.buy-tickets-itemcon.tickets-con-stage").items()]
        if stages:
            # Ticket stages win. The original `A or [B] if C else []` parsed
            # as `(A or [B]) if C else []` and discarded these whenever the
            # calendar block was missing.
            return stages
        calendar = doc("DIV.article>DIV.eventwrap>DIV#event-info>DIV.event-info>DIV.event-detail>UL.calendar-strs>LI.calendar-str-item")
        return [calendar.text()] if calendar else []

    def _event_prices(self, doc):
        '''Return the list of price strings for an event page (may be empty).'''
        tickets = [x.text() for x in doc("DIV.buy-tickets-bd-con>DIV.buy-tickets-bd>DIV.buy-tickets-item>DIV.tickets-con-price>A.buy-tickets-info").items()]
        if tickets:
            # Same precedence fix as in _event_times.
            return tickets
        fee_label = doc(u'DIV.article DIV#event-info>DIV.event-info>DIV.event-detail>SPAN.pl:contains("费用")')
        if not fee_label:
            return []
        # The fee text follows the label element; tail is None when nothing
        # follows it, so fall back to an empty string instead of raising.
        return [(fee_label[0].tail or '').strip()]

    @config(age=24*60*60)
    def detail_page(self, response):
        '''Extract one event record (url, title, place, time, price, mcid, city).'''
        return {
            "url": response.url,
            "title": response.doc('HTML>BODY>DIV#wrapper>DIV#content DIV.article DIV#event-info>DIV.event-info>H1').text(),
            "place": response.doc("HTML>BODY>DIV#wrapper>DIV#content DIV.article DIV#event-info>DIV.event-info>DIV.event-detail>SPAN.micro-address").text(),
            "time": self._event_times(response.doc),
            "price": self._event_prices(response.doc),
            "mcid": response.doc(u'DIV.article DIV#event-info>DIV.event-info>DIV.event-detail>SPAN.pl:contains("类型") ~ a').text(),
            "city": response.doc('DIV.nav-primary>DIV.local-label>A.label').text(),
        }
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- encoding: utf-8 -*- | |
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: | |
# Created on 2014-10-25 14:31:24 | |
import re | |
import json | |
from libs.pprint import pprint | |
from libs.base_handler import * | |
class Handler(BaseHandler):
    '''
    Crawl the Douban "haixiuzu" group discussion list and forward every topic
    that contains images to the Duoshuo import API.
    '''

    def on_start(self):
        '''Seed the crawl with the group's discussion list.'''
        self.crawl('http://www.douban.com/group/haixiuzu/discussion', callback=self.index_page)

    @every(0, 30)
    def on_cronjob(self):
        '''Periodically re-seed the crawl (schedule set by the decorator).'''
        self.on_start()

    @config(age=10)
    def index_page(self, response):
        '''Queue every absolute http:// topic link found in the title cells.'''
        for each in response.doc('.title a[href^="http://"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(age=30*24*60*60)
    def detail_page(self, response):
        '''Extract url, title, author, author_url and image sources of a topic.'''
        return {
            "url": response.url,
            "title": response.doc("#content h1").text(),
            "author": response.doc(".topic-content .from a").text(),
            "author_url": response.doc("DIV.topic-doc>H3>SPAN.from>A").attr.href,
            "imgs": [x.attr.src for x in response.doc('.topic-doc img').items()]
        }

    def on_result(self, result):
        '''
        Post a topic result to Duoshuo; topics without images are skipped.

        The result is JSON-encoded and base64-wrapped into the post message,
        keyed by the first run of digits found in the current response URL.
        '''
        # Local import keeps this block self-contained.
        import base64

        if not result or not result['imgs']:
            return
        found = re.search(r"\d+", self.response.url)
        if found is None:
            # No numeric id in the URL: there is no usable post key.
            return
        post_id = found.group(0)
        # base64.b64encode replaces the Python 2-only "base64" str codec; it
        # emits no newlines, so the old replace("\n", "") is unnecessary.
        message = base64.b64encode(json.dumps(result).encode("utf-8")).decode("ascii")
        self.crawl("https://api.duoshuo.com/posts/import.json#"+post_id, method="POST",
                   data={
                       "short_name": "database",
                       # NOTE(review): API secret is hard-coded in source;
                       # consider moving it to configuration and rotating it.
                       "secret": "8e5a5be8873ad7e9a59147c3cfd10e73",
                       "posts[0][post_key]": post_id,
                       "posts[0][thread_key]": "haixiuzu",
                       "posts[0][message]": message
                   }, callback=self.post_to_duoshuo)

    def post_to_duoshuo(self):
        '''No-op callback for the Duoshuo import request.'''
        pass
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- encoding: utf-8 -*- | |
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: | |
# Created on 2014-09-10 16:23:29 | |
import os | |
import re | |
import json | |
import datetime | |
from libs.pprint import pprint | |
from libs.base_handler import * | |
class Handler(BaseHandler):
    '''
    Crawl event listings from huodongxing.com and extract one record per
    event detail page.
    '''

    def on_start(self):
        '''Queue list pages 1 through 33 of the event listing.'''
        for each in range(1, 34):
            self.crawl('http://www.huodongxing.com/events?type=0&show=list&d=t1&page=%d' % each, callback=self.index_page)

    @every(60)
    def cronjob(self):
        '''Periodically re-seed the crawl (schedule set by the decorator).'''
        self.on_start()

    @config(age=60*60)
    def index_page(self, response):
        '''Queue the detail page of every event linked from a list page.'''
        for each in response.doc('#container > div.article > ul > li > h3 > a').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def _ticket_info(self, text):
        '''
        Return the decoded `eventTicketsJson` value embedded in the page
        script, or None when it is absent or not valid JSON.
        '''
        found = re.search(r'var eventTicketsJson\s*=\s*([^;]+)', text)
        if found is None:
            # Page carries no ticket data; do not fail the whole record.
            return None
        try:
            return json.loads(found.group(1))
        except ValueError:
            # The capture stops at the first ';', so JSON containing a
            # semicolon is truncated and cannot be decoded.
            return None

    @config(age=24*60*60)
    def detail_page(self, response):
        '''
        Extract url, title, place, time and price of a single event.

        "time" is a one-element list, or None when the time icon is missing.
        "price" is the decoded ticket JSON, or None when unavailable.
        '''
        time_icon = response.doc("HTML>BODY>DIV#container>DIV.jumbotron.media>DIV.media-body>DIV>em.icon-time")
        return {
            "url": response.url,
            "title": response.doc("HTML>BODY>DIV#container>DIV.jumbotron.media>DIV.media-body>H2.media-heading").text(),
            "place": response.doc("HTML>BODY>DIV#container>DIV.jumbotron.media>DIV.media-body>DIV.address>A").text(),
            # The time text is read from the last ancestor returned for the icon.
            "time": [time_icon.parents()[-1].text_content().strip()] if time_icon else None,
            "price": self._ticket_info(response.text),
        }
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- encoding: utf-8 -*- | |
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: | |
# Created on 2014-09-10 17:13:25 | |
import os | |
import re | |
import json | |
import datetime | |
from libs.pprint import pprint | |
from libs.base_handler import * | |
class Handler(BaseHandler):
    '''
    Crawl the 228.com.cn "this week" JSON feed and emit one message per
    product listed in it.
    '''

    def on_start(self):
        '''Request the first page of the feed.'''
        first_page = 'http://www.228.com.cn/s/thisWeek-update/?j=1&p=1'
        self.crawl(first_page, callback=self.index_page)

    @every(minutes=10)
    def on_cron_job(self):
        '''Re-seed the crawl on the schedule given by the decorator.'''
        self.on_start()

    @config(age=10*60)
    def index_page(self, response):
        '''
        Fan out to pages 2..pageSize, then send one message per product.

        Each message is addressed to this project and tagged with the page
        URL plus the product's position on the page.
        '''
        last_page = response.json['pageSize']
        page_no = 2
        while page_no <= last_page:
            page_url = 'http://www.228.com.cn/s/thisWeek-update/?j=1&p=%d' % page_no
            self.crawl(page_url, callback=self.index_page)
            page_no += 1

        position = 0
        for product in response.json['products']:
            record = {
                "title": product['name'],
                "ccid": product['typeaname'],
                'mcid': product['typebname'],
                'city': product['cityname'],
                'place': product['vname'],
                'time': product['likedata'].split(' '),
                'price': product['prices'].split(','),
                'url': "http://www.228.com.cn/ticket-%s.html" % product['productid'],
            }
            message_url = "%s#%s" % (response.url, position)
            self.send_message(self.project_name, record, url=message_url)
            position += 1

    def on_message(self, project, msg):
        '''Return the received message unchanged.'''
        return msg
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment