Skip to content

Instantly share code, notes, and snippets.

@lisongx
Created August 8, 2012 09:56
Show Gist options
  • Save lisongx/3293914 to your computer and use it in GitHub Desktop.
Save lisongx/3293914 to your computer and use it in GitHub Desktop.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from dmovie.items import MovieItem
import re
class MovieSpider(CrawlSpider):
    """Crawl movie.douban.com and emit a ``MovieItem`` for each subject page.

    Links whose URL matches ``/subject/...`` are followed and handed to
    :meth:`parse_movie`; links matching ``doulist/<digits>...`` are only
    followed so that further subject links can be discovered.
    """

    name = 'douban_movie'
    # Entries must be bare host names (no scheme, no path) and must cover the
    # host used in ``start_urls``; a full URL here does not match any request
    # host, so followed links would be treated as offsite.
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/']
    rules = [
        Rule(SgmlLinkExtractor(allow=[r'/subject/\S*/?']),
             follow=True,
             callback='parse_movie'),
        Rule(SgmlLinkExtractor(allow=[r'doulist/\d+\S+']), follow=True),
    ]

    # Captures the path segment immediately following ``/subject/``. This works
    # with or without a trailing slash, on nested pages
    # (``/subject/<id>/reviews``), and when a query string or fragment follows.
    _SUBJECT_ID_RE = re.compile(r'/subject/([^/?#]+)')

    def parse_movie(self, response):
        """Build a ``MovieItem`` from a subject page response.

        Populated fields:

        * ``title`` -- first whitespace-separated token of the page's
          ``<title>`` text, or ``''`` when the title is missing or blank.
        * ``douban_id`` -- the path segment following ``/subject/`` in
          ``response.url``.
        * ``complete_video`` -- number of ``<a>`` elements whose ``class``
          attribute contains ``complete_video_link``.

        Returns ``None`` (no item) when the URL carries no subject id.
        """
        id_match = self._SUBJECT_ID_RE.search(response.url)
        if id_match is None:
            # Without an id the item cannot be tied to a movie; skip the page.
            return None

        hxs = HtmlXPathSelector(response)

        # Guard both lookups: a page may lack <title>, or have one that is
        # whitespace only, and either case used to raise IndexError.
        title_texts = hxs.select('//title/text()').extract()
        title_words = title_texts[0].split() if title_texts else []

        video_links = hxs.select(
            "//a[contains(@class, 'complete_video_link')]")

        item = MovieItem()
        item['title'] = title_words[0] if title_words else ''
        item['douban_id'] = id_match.group(1)
        item['complete_video'] = len(video_links)
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment