Skip to content

Instantly share code, notes, and snippets.

@seagatesoft
Created May 29, 2021 04:20
Show Gist options
  • Save seagatesoft/5daefacfb63bcfed19b45fde0ec6f26b to your computer and use it in GitHub Desktop.
Save seagatesoft/5daefacfb63bcfed19b45fde0ec6f26b to your computer and use it in GitHub Desktop.
Bandung.py Scrapy demo
from math import ceil
from scrapy import Request, Spider
class BookSpider(Spider):
    """Crawl books.toscrape.com and emit one item per book detail page.

    ``parse`` handles catalogue listing pages: it requests every book
    linked from the page and, for responses not flagged as pagination,
    also requests listing pages 2..last.  ``parse_detail`` turns a book
    page into a dict with ``title`` and ``stock`` keys.
    """

    name = 'books_toscrape_com'
    start_urls = ['http://books.toscrape.com/']

    @staticmethod
    def _to_int(text):
        """Return ``int(text)``, or ``None`` if *text* is missing or not numeric."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    def parse(self, response):
        """Yield detail-page requests and, once, the remaining listing pages.

        Listing-page requests created here carry ``meta['pagination']`` so
        that their responses do not schedule the page list a second time.
        """
        for book in response.css("ol.row > li"):
            detail_href = book.css("h3 > a::attr(href)").get()
            if detail_href is None:
                # Entry without a link: there is no detail page to request.
                continue
            yield Request(
                response.urljoin(detail_href),
                callback=self.parse_detail,
            )

        if response.meta.get('pagination'):
            # This response was itself scheduled by the pagination loop below.
            return

        # Total number of books: first <strong> of the results form.
        total_books = self._to_int(
            response.xpath(
                "//form[has-class('form-horizontal')]/strong[1]/text()"
            ).get()
        )
        # Number of books per page: third <strong> of the results form.
        # NOTE(review): presumably this is the page size on the first
        # listing page -- confirm against the site markup.
        books_per_page = self._to_int(
            response.xpath(
                "//form[has-class('form-horizontal')]/strong[3]/text()"
            ).get()
        )

        if total_books is None or not books_per_page:
            # Without both counts the page total cannot be computed; keep the
            # items from this page instead of failing the whole callback.
            self.logger.warning(
                "Could not read pagination counts on %s; "
                "skipping remaining pages",
                response.url,
            )
            return

        # Total pages = ceil(total books / books per page).
        total_pages = ceil(total_books / books_per_page)

        # Request page 2 through the last page.
        for page_number in range(2, total_pages + 1):
            page_url = f"http://books.toscrape.com/catalogue/page-{page_number}.html"
            yield Request(
                page_url,
                meta={'pagination': True},
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Build the item for a single book page.

        Returns a dict with ``title`` (text of the first ``h1``) and
        ``stock`` (the number inside "(N available)", or ``None`` when the
        availability text contains no such count).
        """
        stock_text = response.css(
            "div.product_main p.instock.availability::text"
        ).re_first(r'\((\d+) available\)')
        return {
            'title': response.css("h1::text").get(),
            # re_first yields None when nothing matches; _to_int passes
            # that through instead of raising.
            'stock': self._to_int(stock_text),
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment