widnyana · January 6, 2016 07:23
diff --git a/imdb_next_page_spider.py b/imdb_next_page_spider.py
 '''
 Spider for IMDb
 - Retrieve most popular movies & TV series with rating of 8.0 and above
 - Crawl next pages recursively
 '''

 from scrapy.contrib.spiders import CrawlSpider, Rule
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.selector import Selector

 from scrapy_tutorial.items import ScrapyTutorialItem

 class IMDbNextPageSpider(CrawlSpider):
 	'''
 	Spider that walks IMDb title-search listing pages.

 	Crawling begins at start_urls; the single rule below follows the
 	"Next" link found on each page and hands every page to
 	parse_listings, which keeps the titles rated MIN_RATING or higher.
 	'''

 	name = "imdbnextpage"
 	allowed_domains = ["imdb.com"]
 	start_urls = [
 		"http://www.imdb.com/search/title?count=20&start=1&title_type=feature,tv_series"
 	]
 	rules = (
 		# Extract links for next pages
 		Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[contains(@class, "leftright")][1]//a[contains(., "Next")]')), callback='parse_listings', follow=True),
 	)

 	# Lowest rating (inclusive) a title must have to be returned
 	MIN_RATING = 8.0

 	def parse_start_url(self, response):
 		'''
 		Crawl start_urls

 		The first page is a listing page as well, so it is parsed exactly
 		like the pages reached through the "Next" rule.
 		'''

 		return self.parse_listings(response)

 	def parse_listings(self, response):
 		'''
 		Extract data from listing pages

 		Builds one ScrapyTutorialItem per result row, normalises it and
 		returns a list with the items rated MIN_RATING or higher.
 		'''

 		sel = Selector(response)
 		films = sel.xpath('//table[contains(@class, "results")]//tr[contains(@class, "detailed")]')
 		items = []

 		for film in films:
 			# Populate film fields
 			item = ScrapyTutorialItem()
 			item['title'] = film.xpath('.//td[contains(@class, "title")]/a/text()').extract()
 			item['year'] = film.xpath('.//span[contains(@class, "year_type")]/text()').extract()
 			item['rating'] = film.xpath('.//span[contains(@class, "rating-rating")]/span[contains(@class, "value")]/text()').extract()
 			item['description'] = film.xpath('.//span[contains(@class, "outline")]/text()').extract()
 			item['poster_url'] = film.xpath('.//td[contains(@class, "image")]//img/@src').extract()
 			item['film_url'] = film.xpath('.//td[contains(@class, "title")]/a/@href').extract()
 			item = self.__normalise_item(item, response.url)

 			# Get films with rating of 8.0 and above (inclusive bound).
 			# A rating that cannot be parsed becomes 0.0 in __to_float
 			# and is therefore dropped here.
 			if item['rating'] >= self.MIN_RATING:
 				items.append(item)

 		return items

 	def __normalise_item(self, item, base_url):
 		'''
 		Standardise and format item fields

 		item is modified in place and also returned. base_url is the URL
 		of the page the item came from; it is used to resolve film_url.
 		'''

 		# Loop item fields to sanitise data and standardise data types.
 		# The keys are copied first because the loop reassigns values on
 		# the very item it is walking over.
 		for key in list(item.keys()):
 			item[key] = self.__normalise(item[key])

 		# Clean year and convert year from string to int
 		item['year'] = item['year'].strip('()')
 		item['type'] = 'Movie'

 		# Anything longer than four characters is treated as
 		# "<4-digit year><separator><type>": the text from index 5 on
 		# becomes the type, the first four characters stay as the year.
 		if len(item['year']) > 4:
 			item['type'] = item['year'][5:]
 			item['year'] = item['year'][0:4]
 		item['year'] = self.__to_int(item['year'])

 		# Convert rating from string to float
 		item['rating'] = self.__to_float(item['rating'])

 		# Convert film URL from relative to absolute URL
 		item['film_url'] = self.__to_absolute_url(base_url, item['film_url'])

 		return item

 	def __normalise(self, value):
 		'''
 		Collapse an extracted value into a single trimmed string

 		A list is joined with single spaces; any other value is used as
 		is. Leading and trailing whitespace is then stripped.
 		'''

 		# Convert list to string
 		if isinstance(value, list):
 			value = ' '.join(value)

 		# Trim leading and trailing special characters (Whitespaces, newlines, spaces, tabs, carriage returns)
 		return value.strip()

 	def __to_absolute_url(self, base_url, link):
 		'''
 		Convert relative URL to absolute URL
 		'''

 		# urljoin lives in different modules on Python 2 and Python 3
 		try:
 			from urlparse import urljoin
 		except ImportError:
 			from urllib.parse import urljoin

 		return urljoin(base_url, link)

 	def __to_int(self, value):
 		'''
 		Convert value to integer type

 		Returns 0 when value cannot be converted.
 		'''

 		try:
 			return int(value)
 		except (TypeError, ValueError):
 			return 0

 	def __to_float(self, value):
 		'''
 		Convert value to float type

 		Returns 0.0 when value cannot be converted.
 		'''

 		try:
 			return float(value)
 		except (TypeError, ValueError):
 			return 0.0
	'''
	Spider for IMDb
	- Retrieve most popular movies & TV series with rating of 8.0 and above
	- Crawl next pages recursively
	'''

	from scrapy.contrib.spiders import CrawlSpider, Rule
	from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
	from scrapy.selector import Selector

	from scrapy_tutorial.items import ScrapyTutorialItem

	class IMDbNextPageSpider(CrawlSpider):
		'''
		Spider that walks IMDb title-search listing pages.

		Crawling begins at start_urls; the single rule below follows the
		"Next" link found on each page and hands every page to
		parse_listings, which keeps the titles rated MIN_RATING or higher.
		'''

		name = "imdbnextpage"
		allowed_domains = ["imdb.com"]
		start_urls = [
			"http://www.imdb.com/search/title?count=20&start=1&title_type=feature,tv_series"
		]
		rules = (
			# Extract links for next pages
			Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[contains(@class, "leftright")][1]//a[contains(., "Next")]')), callback='parse_listings', follow=True),
		)

		# Lowest rating (inclusive) a title must have to be returned
		MIN_RATING = 8.0

		def parse_start_url(self, response):
			'''
			Crawl start_urls

			The first page is a listing page as well, so it is parsed exactly
			like the pages reached through the "Next" rule.
			'''

			return self.parse_listings(response)

		def parse_listings(self, response):
			'''
			Extract data from listing pages

			Builds one ScrapyTutorialItem per result row, normalises it and
			returns a list with the items rated MIN_RATING or higher.
			'''

			sel = Selector(response)
			films = sel.xpath('//table[contains(@class, "results")]//tr[contains(@class, "detailed")]')
			items = []

			for film in films:
				# Populate film fields
				item = ScrapyTutorialItem()
				item['title'] = film.xpath('.//td[contains(@class, "title")]/a/text()').extract()
				item['year'] = film.xpath('.//span[contains(@class, "year_type")]/text()').extract()
				item['rating'] = film.xpath('.//span[contains(@class, "rating-rating")]/span[contains(@class, "value")]/text()').extract()
				item['description'] = film.xpath('.//span[contains(@class, "outline")]/text()').extract()
				item['poster_url'] = film.xpath('.//td[contains(@class, "image")]//img/@src').extract()
				item['film_url'] = film.xpath('.//td[contains(@class, "title")]/a/@href').extract()
				item = self.__normalise_item(item, response.url)

				# Get films with rating of 8.0 and above (inclusive bound).
				# A rating that cannot be parsed becomes 0.0 in __to_float
				# and is therefore dropped here.
				if item['rating'] >= self.MIN_RATING:
					items.append(item)

			return items

		def __normalise_item(self, item, base_url):
			'''
			Standardise and format item fields

			item is modified in place and also returned. base_url is the URL
			of the page the item came from; it is used to resolve film_url.
			'''

			# Loop item fields to sanitise data and standardise data types.
			# The keys are copied first because the loop reassigns values on
			# the very item it is walking over.
			for key in list(item.keys()):
				item[key] = self.__normalise(item[key])

			# Clean year and convert year from string to int
			item['year'] = item['year'].strip('()')
			item['type'] = 'Movie'

			# Anything longer than four characters is treated as
			# "<4-digit year><separator><type>": the text from index 5 on
			# becomes the type, the first four characters stay as the year.
			if len(item['year']) > 4:
				item['type'] = item['year'][5:]
				item['year'] = item['year'][0:4]
			item['year'] = self.__to_int(item['year'])

			# Convert rating from string to float
			item['rating'] = self.__to_float(item['rating'])

			# Convert film URL from relative to absolute URL
			item['film_url'] = self.__to_absolute_url(base_url, item['film_url'])

			return item

		def __normalise(self, value):
			'''
			Collapse an extracted value into a single trimmed string

			A list is joined with single spaces; any other value is used as
			is. Leading and trailing whitespace is then stripped.
			'''

			# Convert list to string
			if isinstance(value, list):
				value = ' '.join(value)

			# Trim leading and trailing special characters (Whitespaces, newlines, spaces, tabs, carriage returns)
			return value.strip()

		def __to_absolute_url(self, base_url, link):
			'''
			Convert relative URL to absolute URL
			'''

			# urljoin lives in different modules on Python 2 and Python 3
			try:
				from urlparse import urljoin
			except ImportError:
				from urllib.parse import urljoin

			return urljoin(base_url, link)

		def __to_int(self, value):
			'''
			Convert value to integer type

			Returns 0 when value cannot be converted.
			'''

			try:
				return int(value)
			except (TypeError, ValueError):
				return 0

		def __to_float(self, value):
			'''
			Convert value to float type

			Returns 0.0 when value cannot be converted.
			'''

			try:
				return float(value)
			except (TypeError, ValueError):
				return 0.0