jonathanoheix

32 followers · 12 following

Macif-Mutualité
France

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

jonathanoheix / scraping1.py

Created December 11, 2018 14:53

	import requests

	# Landing page of the demo bookstore; every later snippet starts from it.
	main_url = "http://books.toscrape.com/index.html"

	# Download the page once and keep the response for the parsing steps.
	result = requests.get(main_url)

	# Preview only the first 1000 characters of the raw HTML.
	result.text[0:1000]

jonathanoheix / scraping2.py

Created December 11, 2018 14:53

	from bs4 import BeautifulSoup

	# Build the parse tree from the downloaded HTML using the 'html.parser' backend.
	soup = BeautifulSoup(result.text, features='html.parser')

	# Print just the first 1000 characters of the re-indented markup.
	print(soup.prettify()[0:1000])

jonathanoheix / scraping3.py

Created December 11, 2018 14:54

	def getAndParseURL(url):
	    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        A ``BeautifulSoup`` object built from the response text with the
	        ``html.parser`` backend.
	    """
	    # ``requests`` and ``BeautifulSoup`` are imported by earlier snippets.
	    response = requests.get(url)
	    return BeautifulSoup(response.text, 'html.parser')

jonathanoheix / scraping4.py

Created December 11, 2018 14:54

# First product card on the page: the first <article> whose class is "product_pod".
soup.find("article", attrs={"class": "product_pod"})

jonathanoheix / scraping5.py

Created December 11, 2018 14:54

# Drill into the first product card: its first <div>, then that div's first <a>.
soup.find("article", class_="product_pod").find("div").find("a")

jonathanoheix / scraping6.py

Created December 11, 2018 14:54

# Read the link target (href attribute) of the first product card's anchor.
soup.find(
    "article",
    class_="product_pod",
).div.a.get('href')

jonathanoheix / scraping7.py

Created December 11, 2018 14:54

	# Gather the href of every product card found on the parsed main page.
	main_page_products_urls = [
	    card.div.a.get('href')
	    for card in soup.findAll("article", class_="product_pod")
	]

	# Report how many links were collected, then show the first one.
	print(len(main_page_products_urls), " fetched products URLs", sep="")
	print("One example:")
	main_page_products_urls[0]

jonathanoheix / scraping8.py

Created December 11, 2018 14:55

	def getBooksURLs(url):
	    """Return the absolute URL of every book listed on the page at *url*.

	    Args:
	        url: Address of a listing page (main page, category or results page).

	    Returns:
	        A list of absolute URLs, one per ``<article class="product_pod">``
	        found on the page, in document order.
	    """
	    from urllib.parse import urljoin

	    soup = getAndParseURL(url)
	    # Resolve each relative href against the page URL. Unlike chopping the
	    # last path segment by hand, urljoin also copes with a URL that has no
	    # path component and collapses any "../" segments in the href.
	    return [
	        urljoin(url, article.div.a.get('href'))
	        for article in soup.find_all("article", class_="product_pod")
	    ]

jonathanoheix / scraping9.py

Created December 11, 2018 14:55

	import re
	from urllib.parse import urljoin

	# main_url ends in "index.html", so plain string concatenation would give
	# ".../index.htmlcatalogue/category/..."; urljoin resolves each relative
	# href against the page URL instead.
	categories_urls = [
	    urljoin(main_url, link.get('href'))
	    for link in soup.find_all("a", href=re.compile("catalogue/category/books"))
	]
	categories_urls = categories_urls[1:] # we remove the first one because it corresponds to all the books

	print(str(len(categories_urls)) + " fetched categories URLs")
	print("Some examples:")
	categories_urls[:5]

jonathanoheix / scraping10.py

Created December 11, 2018 14:55

	# store all the results into a list
	pages_urls = [main_url]

	soup = getAndParseURL(pages_urls[0])

	# As long as we get two matches, the page contains both a 'previous' and a 'next' button.
	# A single match means we are on either the first page or the last page.
	# We stop once we reach the last page.

	while len(soup.findAll("a", href=re.compile("page"))) == 2 or len(pages_urls) == 1:

Older · Newer