jonathanoheix · December 11, 2018 14:57
diff --git a/scraping14.py b/scraping14.py
 # Column buffers for the scraped catalogue: every list receives exactly one
 # entry per URL, in the iteration order of booksURLs.
 names = []
 prices = []
 nb_in_stock = []
 img_urls = []
 categories = []
 ratings = []

 # The patterns do not depend on the page, so compile them once up front.
 product_main_class = re.compile(r"product_main")
 star_rating_class = re.compile(r"star-rating")
 # Dots are escaped so that only a literal "../category/books/" matches;
 # unescaped, each "." would match any character.
 category_href = re.compile(r"\.\./category/books/")

 # scrape data for every book URL: this may take some time
 for url in booksURLs:
     soup = getAndParseURL(url)
     # product name: the <h1> inside the "product_main" container
     names.append(soup.find("div", class_=product_main_class).h1.text)
     # product price: keep only digits and the decimal point, so the result
     # does not depend on how many characters the pound sign decoded to
     prices.append(re.sub(r"[^0-9.]", "", soup.find("p", class_="price_color").text))
     # number of available products: get rid of non numerical characters
     nb_in_stock.append(re.sub(r"[^0-9]", "", soup.find("p", class_="instock availability").text))
     # image url: "src" of the first <img> on the page, appended to the
     # book URL with its trailing "index.html" removed
     img_urls.append(url.replace("index.html", "") + soup.find("img").get("src"))
     # product category: fourth "/"-separated segment of the category link
     categories.append(soup.find("a", href=category_href).get("href").split("/")[3])
     # ratings: second item of the element's class attribute
     # (presumably the rating word, e.g. "Three" - verify against the page markup)
     ratings.append(soup.find("p", class_=star_rating_class).get("class")[1])

 # add data into pandas df
 import pandas as pd

 scraped_data = pd.DataFrame({
     'name': names,
     'price': prices,
     'nb_in_stock': nb_in_stock,
     "url_img": img_urls,
     "product_category": categories,
     "rating": ratings,
 })
 scraped_data.head()
	# Column buffers for the scraped catalogue: every list receives exactly one
	# entry per URL, in the iteration order of booksURLs.
	names = []
	prices = []
	nb_in_stock = []
	img_urls = []
	categories = []
	ratings = []

	# The patterns do not depend on the page, so compile them once up front.
	product_main_class = re.compile(r"product_main")
	star_rating_class = re.compile(r"star-rating")
	# Dots are escaped so that only a literal "../category/books/" matches;
	# unescaped, each "." would match any character.
	category_href = re.compile(r"\.\./category/books/")

	# scrape data for every book URL: this may take some time
	for url in booksURLs:
		soup = getAndParseURL(url)
		# product name: the <h1> inside the "product_main" container
		names.append(soup.find("div", class_=product_main_class).h1.text)
		# product price: keep only digits and the decimal point, so the result
		# does not depend on how many characters the pound sign decoded to
		prices.append(re.sub(r"[^0-9.]", "", soup.find("p", class_="price_color").text))
		# number of available products: get rid of non numerical characters
		nb_in_stock.append(re.sub(r"[^0-9]", "", soup.find("p", class_="instock availability").text))
		# image url: "src" of the first <img> on the page, appended to the
		# book URL with its trailing "index.html" removed
		img_urls.append(url.replace("index.html", "") + soup.find("img").get("src"))
		# product category: fourth "/"-separated segment of the category link
		categories.append(soup.find("a", href=category_href).get("href").split("/")[3])
		# ratings: second item of the element's class attribute
		# (presumably the rating word, e.g. "Three" - verify against the page markup)
		ratings.append(soup.find("p", class_=star_rating_class).get("class")[1])

	# add data into pandas df
	import pandas as pd

	scraped_data = pd.DataFrame({
		'name': names,
		'price': prices,
		'nb_in_stock': nb_in_stock,
		"url_img": img_urls,
		"product_category": categories,
		"rating": ratings,
	})
	scraped_data.head()