Last active
March 27, 2019 12:08
-
-
Save luxu/8ab7e8ff78abf4fbe4a57b4d0b8ec1bf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| import scrapy | |
| import re | |
| from urllib.parse import urlparse, parse_qs | |
class CinemaSpider(scrapy.Spider):
    """Scrape movie details, credits and user reviews from IMDb title pages.

    For every seed title the spider yields one item with the movie details
    and then schedules two follow-up requests: the full-credits page
    (handled by ``parse_director``) and the user-reviews page (handled by
    ``parse_review``).
    """

    name = 'cinema'

    # Root of the site (no trailing slash); every URL built here hangs off it.
    site_root = 'https://www.imdb.com'

    # Upper bound on the number of reviews yielded per title.
    max_reviews = 2

    # Captures the numeric part of a title id such as ``tt4123430``.
    _title_id_re = re.compile(r'tt(\d+)')

    @staticmethod
    def _strip_or_none(value):
        """Return ``value`` without surrounding whitespace, or ``None``.

        ``extract_first()`` returns ``None`` when nothing matched, so calling
        ``.strip()`` on its result directly would raise ``AttributeError``.
        """
        return value.strip() if value is not None else None

    def start_requests(self):
        """Yield one request per seed title page, handled by ``parse``."""
        title_paths = [
            'title/tt4123430/?ref_=nv_sr_2',
            'title/tt4633694/?ref_=nv_sr_1',
        ]
        for path in title_paths:
            yield scrapy.Request(
                url='{}/{}'.format(self.site_root, path),
                callback=self.parse,
            )

    def parse(self, response):
        """Extract the movie item and schedule the credits and reviews pages.

        Yields a dict with the keys ``movie_name``, ``image``, ``desc``,
        ``lang``, ``rating``, ``time_length``, ``genres`` and ``img_thumb``,
        followed by one request for the full-credits page and one for the
        reviews page of the same title. Fields whose node is missing from
        the page are ``None`` (single values) or ``[]`` (lists).
        """
        # ``image`` is the raw href of the poster link as found in the page;
        # it is only prefixed with the site root for the log line.
        image = response.xpath(
            '//div[contains(@class, "poster")]//a//@href'
        ).extract_first()
        if image is not None:
            self.log(u'{}{}'.format(self.site_root, image))

        movie_name = self._strip_or_none(
            response.xpath(
                '//div[contains(@class, "title_wrapper")]//h1//text()'
            ).extract_first()
        )
        desc = response.xpath(
            '//div[contains(@class, "inline canwrap")]//span//text()'
        ).extract_first()
        lang = response.xpath(
            '//div[contains(@id, "titleDetails")]//div[3]//a//text()'
        ).extract()
        rating = response.xpath(
            '//span[contains(@itemprop, "ratingValue")]//text()'
        ).extract_first()
        time_length = response.xpath(
            '//div[12][contains(@class, "txt-block")]//time//text()'
        ).extract_first()
        genres = response.xpath(
            '//h4[contains(., "Genre")]/following-sibling::a//text()'
        ).extract()
        img_thumb = response.xpath(
            '//h2[contains(., "Photos")]/parent::div//a//@loadlate'
        ).extract()

        yield {
            'movie_name': movie_name,
            'image': image,
            'desc': desc,
            'lang': lang,
            'rating': rating,
            'time_length': time_length,
            'genres': genres,
            'img_thumb': img_thumb,
        }

        # The follow-up URLs are derived from the title id in the path of the
        # page that was actually fetched.
        match = self._title_id_re.search(urlparse(response.url).path)
        if match is None:
            # Without an id the follow-up URLs would be malformed, so stop.
            self.logger.warning('No IMDb title id found in %s', response.url)
            return
        title_id = match.group(1)

        yield scrapy.Request(
            url='{}/title/tt{}/fullcredits?ref_=tt_ov_dr#directors/'.format(
                self.site_root, title_id
            ),
            callback=self.parse_director,
        )

        # e.g. https://www.imdb.com/title/tt4633694/reviews
        yield scrapy.Request(
            url='{}/title/tt{}/reviews'.format(self.site_root, title_id),
            callback=self.parse_review,
        )

    def parse_director(self, response):
        """Yield a dict with the ``director`` and ``cast`` name lists.

        Both values are lists of link texts taken from the full-credits
        page; either list is empty when its table is not found.
        """
        # NOTE(review): this path requires an explicit <tbody>; confirm the
        # served HTML contains one rather than it being added by a browser.
        director = response.xpath(
            '//div[contains(@id, "fullcredits_content")]'
            '//table[1]//tbody/tr/td/a/text()'
        ).extract()
        cast = response.xpath(
            '//table[contains(@class, "cast_list")]'
            '//tr[contains(@class, "odd")]//td/a/text()'
        ).extract()
        yield {
            'director': director,
            'cast': cast,
        }

    def parse_review(self, response):
        """Yield one dict per review, at most ``max_reviews`` of them.

        Each dict has the keys ``title``, ``text`` and ``rating``; a value is
        ``None`` when the corresponding node is missing from the review.
        """
        reviews = response.xpath(
            '//div[@class="lister-item mode-detail imdb-user-review collapsable"]'
            '/div[@class="review-container"]/div[@class="lister-item-content"]'
        )[:self.max_reviews]
        for review in reviews:
            title = self._strip_or_none(
                review.xpath('a[@class="title"]/text()').extract_first()
            )
            text = review.xpath(
                'div[@class="content"]'
                '/div[@class="text show-more__control"]/text()'
            ).extract_first()
            rating = review.xpath(
                'div[@class="ipl-ratings-bar"]'
                '/span[@class="rating-other-user-rating"]'
                '/span[1]/text()'
            ).extract_first()
            yield {
                'title': title,
                'text': text,
                'rating': rating,
            }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment