Skip to content

Instantly share code, notes, and snippets.

@jonathanoheix
Created December 11, 2018 14:57
Show Gist options
  • Save jonathanoheix/0d359d9a21f03860e7f68c934dd13bd1 to your computer and use it in GitHub Desktop.
Save jonathanoheix/0d359d9a21f03860e7f68c934dd13bd1 to your computer and use it in GitHub Desktop.
# Scrape the product page of every book and collect the results in a DataFrame.
# NOTE: relies on `re`, `booksURLs` and `getAndParseURL` being defined earlier
# in the file (not shown here); `getAndParseURL` presumably returns a parsed
# BeautifulSoup document for the given URL -- confirm against its definition.
import pandas as pd

# One list per output column; entry i of every list describes the i-th URL.
names = []
prices = []
nb_in_stock = []
img_urls = []
categories = []
ratings = []

# The patterns do not change between iterations, so compile them once.
_PRODUCT_MAIN_RE = re.compile("product_main")
_CATEGORY_HREF_RE = re.compile(r"\.\./category/books/")  # dots are literal
_STAR_RATING_RE = re.compile("star-rating")
_NON_DIGIT_RE = re.compile(r"[^0-9]")
_NON_PRICE_RE = re.compile(r"[^0-9.]")

# scrape data for every book URL: this may take some time
for url in booksURLs:
    soup = getAndParseURL(url)

    # Extract every field first and append afterwards: if a lookup fails for
    # one page, no list has been extended yet, so all columns stay aligned.

    # product name: text of the <h1> inside the "product_main" <div>
    name = soup.find("div", class_=_PRODUCT_MAIN_RE).h1.text

    # product price: keep only digits and the decimal point, so the result
    # does not depend on how many characters the currency symbol decodes to
    # (a fixed [2:] slice would eat a digit when the symbol is one character)
    price = _NON_PRICE_RE.sub("", soup.find("p", class_="price_color").text)

    # number of available products: get rid of non numerical characters
    stock = _NON_DIGIT_RE.sub(
        "", soup.find("p", class_="instock availability").text
    )

    # image url: `src` of the first <img>, appended to the page's directory
    img_url = url.replace("index.html", "") + soup.find("img").get("src")

    # product category: 4th "/"-separated segment of the category link
    category_href = soup.find("a", href=_CATEGORY_HREF_RE).get("href")
    category = category_href.split("/")[3]

    # rating: second CSS class of the "star-rating" <p>
    rating = soup.find("p", class_=_STAR_RATING_RE).get("class")[1]

    names.append(name)
    prices.append(price)
    nb_in_stock.append(stock)
    img_urls.append(img_url)
    categories.append(category)
    ratings.append(rating)

# add data into pandas df (all values are still strings at this point)
scraped_data = pd.DataFrame(
    {
        'name': names,
        'price': prices,
        'nb_in_stock': nb_in_stock,
        "url_img": img_urls,
        "product_category": categories,
        "rating": ratings,
    }
)
scraped_data.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment