luxu · March 27, 2019 12:08
diff --git a/cinema.py b/cinema.py
 # -*- coding: utf-8 -*-
 import scrapy
 import re
 from urllib.parse import urlparse, parse_qs


 class CinemaSpider(scrapy.Spider):
    """Collect title details, credits and user reviews from IMDb title pages.

    ``start_requests`` seeds two hard-coded title pages.  ``parse`` yields one
    item per title page and then requests that title's full-credits and
    reviews pages, which are handled by ``parse_director`` and
    ``parse_review`` respectively.
    """

    name = 'cinema'

    # Captures the numeric part of the title id from a URL path such as
    # "/title/tt4123430/".
    _TITLE_ID_RE = re.compile(r'/title/tt(\d+)')

    @staticmethod
    def _strip_or_none(value):
        """Return ``value.strip()``, or ``None`` when ``value`` is ``None``."""
        return None if value is None else value.strip()

    def start_requests(self):
        """Yield one request per hard-coded title page, handled by ``parse``."""
        base_url = 'https://www.imdb.com/'
        title_paths = [
            'title/tt4123430/?ref_=nv_sr_2',
            'title/tt4633694/?ref_=nv_sr_1',
        ]
        for path in title_paths:
            yield scrapy.Request(
                url=u'{}{}'.format(base_url, path),
                callback=self.parse,
            )

    def parse(self, response):
        """Extract the details of a title page and follow its sub-pages.

        Yields:
            First a dict with the keys ``movie_name``, ``image``, ``desc``,
            ``lang``, ``rating``, ``time_length``, ``genres`` and
            ``img_thumb``.  Then, if a title id can be read from
            ``response.url``, a request for the full-credits page
            (``parse_director``) and one for the reviews page
            (``parse_review``).
        """
        site = 'https://www.imdb.com'
        # The poster link is logged with the site prefix and emitted unchanged
        # in the item, so the XPath is evaluated only once.
        image = response.xpath(
            '//div[contains(@class, "poster")]//a//@href'
        ).extract_first()
        self.log(u'{}{}'.format(site, image))

        # extract_first() returns None when nothing matches, so the strip is
        # guarded instead of being chained directly.
        movie_name = self._strip_or_none(
            response.xpath(
                '//div[contains(@class, "title_wrapper")]//h1//text()'
            ).extract_first()
        )
        desc = response.xpath(
            '//div[contains(@class, "inline canwrap")]//span//text()'
        ).extract_first()
        # NOTE(review): "lang" and "time_length" are selected by position
        # (div[3] / div[12]); presumably these are the language and runtime
        # rows of the details section -- verify against the live markup.
        lang = response.xpath(
            '//div[contains(@id, "titleDetails")]//div[3]//a//text()'
        ).extract()
        rating = response.xpath(
            '//span[contains(@itemprop, "ratingValue")]//text()'
        ).extract_first()
        time_length = response.xpath(
            '//div[12][contains(@class, "txt-block")]//time//text()'
        ).extract_first()
        genres = response.xpath(
            '//h4[contains(., "Genre")]/following-sibling::a//text()'
        ).extract()
        img_thumb = response.xpath(
            '//h2[contains(., "Photos")]/parent::div//a//@loadlate'
        ).extract()
        yield {
            'movie_name': movie_name,
            'image': image,
            'desc': desc,
            'lang': lang,
            'rating': rating,
            'time_length': time_length,
            'genres': genres,
            'img_thumb': img_thumb,
        }

        # Read the id from the "tt<digits>" path segment rather than joining
        # every digit found anywhere in the path.
        match = self._TITLE_ID_RE.search(urlparse(response.url).path)
        if match is None:
            # Without an id the follow-up URLs would be malformed; skip them.
            self.log(u'No IMDb title id found in {}'.format(response.url))
            return
        title_id = match.group(1)

        site_director = (
            'https://www.imdb.com/title/tt{}/fullcredits?ref_=tt_ov_dr#directors/'
            .format(title_id)
        )
        yield scrapy.Request(
            url=site_director,
            callback=self.parse_director,
        )
        # e.g. https://www.imdb.com/title/tt4633694/reviews
        site_review = 'https://www.imdb.com/title/tt{}/reviews'.format(title_id)
        yield scrapy.Request(
            url=site_review,
            callback=self.parse_review,
        )

    def parse_director(self, response):
        """Yield one item with the director and cast names of a credits page.

        Yields:
            A dict with the keys ``director`` and ``cast``, each a list of the
            link texts matched by the corresponding XPath (empty when nothing
            matches).
        """
        # NOTE(review): the "tbody" step only matches if the served HTML
        # contains <tbody>; browsers add it to the DOM on their own, so
        # confirm it is present in the raw response.
        director = response.xpath(
            '//div[contains(@id, "fullcredits_content")]//table[1]//tbody/tr/td/a/text()'
        ).extract()
        # NOTE(review): only rows whose class contains "odd" are read; rows
        # with any other class are skipped -- confirm this is intended.
        cast = response.xpath(
            '//table[contains(@class, "cast_list")]//tr[contains(@class, "odd")]//td/a/text()'
        ).extract()
        yield {
            'director': director,
            'cast': cast,
        }

    def parse_review(self, response):
        """Yield one item for each of the first two user reviews on the page.

        Yields:
            A dict with the keys ``title``, ``text`` and ``rating``; a value is
            ``None`` when the corresponding node is not found.
        """
        # Adjacent literals are concatenated so that no source indentation
        # ends up inside the XPath expression.
        reviews = response.xpath(
            '//div[@class="lister-item mode-detail imdb-user-review collapsable"]'
            '/div[@class="review-container"]/div[@class="lister-item-content"]'
        )[:2]
        for review in reviews:
            # A review without a title link must not abort the callback.
            title = self._strip_or_none(
                review.xpath('a[@class="title"]/text()').extract_first()
            )
            text = review.xpath(
                'div[@class="content"]/div[@class="text show-more__control"]/text()'
            ).extract_first()
            rating = review.xpath(
                'div[@class="ipl-ratings-bar"]/span[@class="rating-other-user-rating"]'
                '/span[1]/text()'
            ).extract_first()
            yield {
                'title': title,
                'text': text,
                'rating': rating,
            }
	# -*- coding: utf-8 -*-
	import scrapy
	import re
	from urllib.parse import urlparse, parse_qs


	class CinemaSpider(scrapy.Spider):
	    """Collect title details, credits and user reviews from IMDb title pages.

	    ``start_requests`` seeds two hard-coded title pages.  ``parse`` yields one
	    item per title page and then requests that title's full-credits and
	    reviews pages, which are handled by ``parse_director`` and
	    ``parse_review`` respectively.
	    """

	    name = 'cinema'

	    # Captures the numeric part of the title id from a URL path such as
	    # "/title/tt4123430/".
	    _TITLE_ID_RE = re.compile(r'/title/tt(\d+)')

	    @staticmethod
	    def _strip_or_none(value):
	        """Return ``value.strip()``, or ``None`` when ``value`` is ``None``."""
	        return None if value is None else value.strip()

	    def start_requests(self):
	        """Yield one request per hard-coded title page, handled by ``parse``."""
	        base_url = 'https://www.imdb.com/'
	        title_paths = [
	            'title/tt4123430/?ref_=nv_sr_2',
	            'title/tt4633694/?ref_=nv_sr_1',
	        ]
	        for path in title_paths:
	            yield scrapy.Request(
	                url=u'{}{}'.format(base_url, path),
	                callback=self.parse,
	            )

	    def parse(self, response):
	        """Extract the details of a title page and follow its sub-pages.

	        Yields:
	            First a dict with the keys ``movie_name``, ``image``, ``desc``,
	            ``lang``, ``rating``, ``time_length``, ``genres`` and
	            ``img_thumb``.  Then, if a title id can be read from
	            ``response.url``, a request for the full-credits page
	            (``parse_director``) and one for the reviews page
	            (``parse_review``).
	        """
	        site = 'https://www.imdb.com'
	        # The poster link is logged with the site prefix and emitted unchanged
	        # in the item, so the XPath is evaluated only once.
	        image = response.xpath(
	            '//div[contains(@class, "poster")]//a//@href'
	        ).extract_first()
	        self.log(u'{}{}'.format(site, image))

	        # extract_first() returns None when nothing matches, so the strip is
	        # guarded instead of being chained directly.
	        movie_name = self._strip_or_none(
	            response.xpath(
	                '//div[contains(@class, "title_wrapper")]//h1//text()'
	            ).extract_first()
	        )
	        desc = response.xpath(
	            '//div[contains(@class, "inline canwrap")]//span//text()'
	        ).extract_first()
	        # NOTE(review): "lang" and "time_length" are selected by position
	        # (div[3] / div[12]); presumably these are the language and runtime
	        # rows of the details section -- verify against the live markup.
	        lang = response.xpath(
	            '//div[contains(@id, "titleDetails")]//div[3]//a//text()'
	        ).extract()
	        rating = response.xpath(
	            '//span[contains(@itemprop, "ratingValue")]//text()'
	        ).extract_first()
	        time_length = response.xpath(
	            '//div[12][contains(@class, "txt-block")]//time//text()'
	        ).extract_first()
	        genres = response.xpath(
	            '//h4[contains(., "Genre")]/following-sibling::a//text()'
	        ).extract()
	        img_thumb = response.xpath(
	            '//h2[contains(., "Photos")]/parent::div//a//@loadlate'
	        ).extract()
	        yield {
	            'movie_name': movie_name,
	            'image': image,
	            'desc': desc,
	            'lang': lang,
	            'rating': rating,
	            'time_length': time_length,
	            'genres': genres,
	            'img_thumb': img_thumb,
	        }

	        # Read the id from the "tt<digits>" path segment rather than joining
	        # every digit found anywhere in the path.
	        match = self._TITLE_ID_RE.search(urlparse(response.url).path)
	        if match is None:
	            # Without an id the follow-up URLs would be malformed; skip them.
	            self.log(u'No IMDb title id found in {}'.format(response.url))
	            return
	        title_id = match.group(1)

	        site_director = (
	            'https://www.imdb.com/title/tt{}/fullcredits?ref_=tt_ov_dr#directors/'
	            .format(title_id)
	        )
	        yield scrapy.Request(
	            url=site_director,
	            callback=self.parse_director,
	        )
	        # e.g. https://www.imdb.com/title/tt4633694/reviews
	        site_review = 'https://www.imdb.com/title/tt{}/reviews'.format(title_id)
	        yield scrapy.Request(
	            url=site_review,
	            callback=self.parse_review,
	        )

	    def parse_director(self, response):
	        """Yield one item with the director and cast names of a credits page.

	        Yields:
	            A dict with the keys ``director`` and ``cast``, each a list of the
	            link texts matched by the corresponding XPath (empty when nothing
	            matches).
	        """
	        # NOTE(review): the "tbody" step only matches if the served HTML
	        # contains <tbody>; browsers add it to the DOM on their own, so
	        # confirm it is present in the raw response.
	        director = response.xpath(
	            '//div[contains(@id, "fullcredits_content")]//table[1]//tbody/tr/td/a/text()'
	        ).extract()
	        # NOTE(review): only rows whose class contains "odd" are read; rows
	        # with any other class are skipped -- confirm this is intended.
	        cast = response.xpath(
	            '//table[contains(@class, "cast_list")]//tr[contains(@class, "odd")]//td/a/text()'
	        ).extract()
	        yield {
	            'director': director,
	            'cast': cast,
	        }

	    def parse_review(self, response):
	        """Yield one item for each of the first two user reviews on the page.

	        Yields:
	            A dict with the keys ``title``, ``text`` and ``rating``; a value is
	            ``None`` when the corresponding node is not found.
	        """
	        # Adjacent literals are concatenated so that no source indentation
	        # ends up inside the XPath expression.
	        reviews = response.xpath(
	            '//div[@class="lister-item mode-detail imdb-user-review collapsable"]'
	            '/div[@class="review-container"]/div[@class="lister-item-content"]'
	        )[:2]
	        for review in reviews:
	            # A review without a title link must not abort the callback.
	            title = self._strip_or_none(
	                review.xpath('a[@class="title"]/text()').extract_first()
	            )
	            text = review.xpath(
	                'div[@class="content"]/div[@class="text show-more__control"]/text()'
	            ).extract_first()
	            rating = review.xpath(
	                'div[@class="ipl-ratings-bar"]/span[@class="rating-other-user-rating"]'
	                '/span[1]/text()'
	            ).extract_first()
	            yield {
	                'title': title,
	                'text': text,
	                'rating': rating,
	            }
No results found