Skip to content

Instantly share code, notes, and snippets.

@luxu
Last active March 27, 2019 12:08
Show Gist options
  • Select an option

  • Save luxu/8ab7e8ff78abf4fbe4a57b4d0b8ec1bf to your computer and use it in GitHub Desktop.

Select an option

Save luxu/8ab7e8ff78abf4fbe4a57b4d0b8ec1bf to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import scrapy
import re
from urllib.parse import urlparse, parse_qs
class CinemaSpider(scrapy.Spider):
    """Scrape title details, credits and user reviews for a fixed set of IMDb titles.

    For every title page the spider yields one item with the movie data and
    then follows the title's full-credits page (handled by
    :meth:`parse_director`) and its reviews page (handled by
    :meth:`parse_review`).
    """

    name = 'cinema'

    # Scheme and host shared by every URL the spider builds.
    BASE_URL = 'https://www.imdb.com'

    # Captures the numeric part of a title id in a URL path,
    # e.g. "/title/tt4123430/" -> "4123430".
    TITLE_ID_RE = re.compile(r'/title/tt(\d+)')

    @staticmethod
    def _strip_or_none(text):
        """Return *text* without surrounding whitespace, or None if *text* is None."""
        return text.strip() if text is not None else None

    def start_requests(self):
        """Yield one request per hard-coded title page, handled by :meth:`parse`."""
        title_paths = [
            '/title/tt4123430/?ref_=nv_sr_2',
            '/title/tt4633694/?ref_=nv_sr_1',
        ]
        for path in title_paths:
            yield scrapy.Request(
                url='{}{}'.format(self.BASE_URL, path),
                callback=self.parse,
            )

    def parse(self, response):
        """Extract the movie item from a title page and follow its sub-pages.

        Yields:
            A dict with the keys ``movie_name``, ``image``, ``desc``, ``lang``,
            ``rating``, ``time_length``, ``genres`` and ``img_thumb``, followed
            by requests for the full-credits and reviews pages of the same
            title. Single-value fields are None when their XPath matches
            nothing; list fields are then empty.
        """
        # The poster link feeds both the log line and the item: query it once.
        image = response.xpath(
            '//div[contains(@class, "poster")]//a//@href'
        ).extract_first()
        if image is not None:
            self.log('{}{}'.format(self.BASE_URL, image))

        # extract_first() returns None when nothing matches, so strip safely
        # instead of calling .strip() on the raw result.
        movie_name = self._strip_or_none(
            response.xpath(
                '//div[contains(@class, "title_wrapper")]//h1//text()'
            ).extract_first()
        )
        desc = response.xpath(
            '//div[contains(@class, "inline canwrap")]//span//text()'
        ).extract_first()
        lang = response.xpath(
            '//div[contains(@id, "titleDetails")]//div[3]//a//text()'
        ).extract()
        rating = response.xpath(
            '//span[contains(@itemprop, "ratingValue")]//text()'
        ).extract_first()
        time_length = response.xpath(
            '//div[12][contains(@class, "txt-block")]//time//text()'
        ).extract_first()
        genres = response.xpath(
            '//h4[contains(., "Genre")]/following-sibling::a//text()'
        ).extract()
        img_thumb = response.xpath(
            '//h2[contains(., "Photos")]/parent::div//a//@loadlate'
        ).extract()

        yield {
            'movie_name': movie_name,
            'image': image,
            'desc': desc,
            'lang': lang,
            'rating': rating,
            'time_length': time_length,
            'genres': genres,
            'img_thumb': img_thumb,
        }

        # Take the id from the "/title/tt<digits>" segment only, so that other
        # digits in the path cannot leak into the follow-up URLs.
        match = self.TITLE_ID_RE.search(urlparse(response.url).path)
        if match is None:
            # Without an id the follow-up URLs would be malformed; skip them.
            self.log('No IMDb title id found in {}'.format(response.url))
            return
        title_id = match.group(1)

        yield scrapy.Request(
            url='{}/title/tt{}/fullcredits?ref_=tt_ov_dr#directors/'.format(
                self.BASE_URL, title_id
            ),
            callback=self.parse_director,
        )
        # e.g. https://www.imdb.com/title/tt4633694/reviews
        yield scrapy.Request(
            url='{}/title/tt{}/reviews'.format(self.BASE_URL, title_id),
            callback=self.parse_review,
        )

    def parse_director(self, response):
        """Yield a dict with the ``director`` and ``cast`` name lists of a full-credits page."""
        director = response.xpath(
            '//div[contains(@id, "fullcredits_content")]//table[1]//tbody/tr/td/a/text()'
        ).extract()
        # NOTE(review): only rows whose class contains "odd" are selected, so
        # any other cast rows are skipped -- confirm this is intended.
        cast = response.xpath(
            '//table[contains(@class, "cast_list")]//tr[contains(@class, "odd")]//td/a/text()'
        ).extract()
        yield {
            'director': director,
            'cast': cast,
        }

    def parse_review(self, response):
        """Yield ``title``, ``text`` and ``rating`` for the first two reviews on the page.

        Each field is None when its XPath matches nothing inside the review.
        """
        # Adjacent literals keep the expression free of the source whitespace
        # that a backslash continuation inside the string would embed.
        review_nodes = response.xpath(
            '//div[@class="lister-item mode-detail imdb-user-review collapsable"]'
            '/div[@class="review-container"]/div[@class="lister-item-content"]'
        )[:2]
        for review in review_nodes:
            title = self._strip_or_none(
                review.xpath('a[@class="title"]/text()').extract_first()
            )
            text = review.xpath(
                'div[@class="content"]/div[@class="text show-more__control"]/text()'
            ).extract_first()
            rating = review.xpath(
                'div[@class="ipl-ratings-bar"]/span[@class="rating-other-user-rating"]'
                '/span[1]/text()'
            ).extract_first()
            yield {
                'title': title,
                'text': text,
                'rating': rating,
            }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment