Skip to content

Instantly share code, notes, and snippets.

View jonathanoheix's full-sized avatar

jonathanoheix

  • Macif-Mutualité
  • France
View GitHub Profile
# Entry page of the site; it is downloaded and parsed below.
main_url = "http://books.toscrape.com/index.html"
import requests
# Fetch the page over HTTP; `result` holds the response object.
result = requests.get(main_url)
# Peek at the first 1000 characters of the returned HTML. This is a bare
# expression: it only displays something in an interactive session.
result.text[:1000]
from bs4 import BeautifulSoup
# Parse the HTML with Python's built-in parser into a navigable tree.
soup = BeautifulSoup(result.text, 'html.parser')
# Print the first 1000 characters of the re-indented markup.
print(soup.prettify()[:1000])
def getAndParseURL(url):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        built-in ``html.parser``.

    Note:
        The HTTP status code is not inspected, so an error page is parsed
        like any other response. Exceptions raised by ``requests.get``
        propagate to the caller.
    """
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')
# Step-by-step exploration of the first product card: the <article> element,
# then the link nested in its first <div>, then that link's href.
# (Bare expressions: they only display something in an interactive session.)
soup.find("article", class_="product_pod")
soup.find("article", class_="product_pod").div.a
soup.find("article", class_="product_pod").div.a.get('href')

# Same extraction applied to every product card on the main page.
# find_all is the current name of the legacy findAll alias.
main_page_products_urls = [
    article.div.a.get('href')
    for article in soup.find_all("article", class_="product_pod")
]
print(str(len(main_page_products_urls)) + " fetched products URLs")
print("One example:")
main_page_products_urls[0]
def getBooksURLs(url):
    """Return the URL of every product listed on the page at ``url``.

    Args:
        url: Address of a listing page. Everything after its last "/"
            (e.g. ``index.html``) is dropped to form the prefix that the
            product links are appended to.

    Returns:
        A list of strings, one per ``<article class="product_pod">`` found
        on the page, each being ``<prefix>/<href>``. The href is appended
        verbatim; ``..`` segments are not collapsed.
    """
    soup = getAndParseURL(url)
    # Remove the last path segment (the index.html part) of the page URL.
    # Computed once here instead of once per product.
    base_url = url.rpartition("/")[0]
    return [
        base_url + "/" + article.div.a.get('href')
        for article in soup.find_all("article", class_="product_pod")
    ]
import re
from urllib.parse import urljoin

# Collect every link whose href matches the category path and resolve it
# against main_url. Plain concatenation would glue the href directly onto
# ".../index.html" and produce an invalid address, so urljoin is used to
# replace the last path segment of main_url instead.
categories_urls = [
    urljoin(main_url, link.get('href'))
    for link in soup.find_all("a", href=re.compile("catalogue/category/books"))
]
categories_urls = categories_urls[1:] # we remove the first one because it corresponds to all the books
print(str(len(categories_urls)) + " fetched categories URLs")
print("Some examples:")
categories_urls[:5]
# Store all the results in a list; it starts with the main page URL.
pages_urls = [main_url]
# Parse the first page so the loop below can inspect its "page" links.
soup = getAndParseURL(pages_urls[0])
# while we get two matches, this means that the webpage contains a 'previous' and a 'next' button
# if there is only one button, this means that we are either on the first page or on the last page
# we stop when we get to the last page
while len(soup.findAll("a", href=re.compile("page"))) == 2 or len(pages_urls) == 1: