Created
December 11, 2018 14:57
-
-
Save jonathanoheix/0d359d9a21f03860e7f68c934dd13bd1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the detail page of every book URL and gather the fields in a DataFrame.
# Relies on `re`, `booksURLs` and `getAndParseURL` being defined earlier in the
# file (presumably getAndParseURL returns a parsed BeautifulSoup document --
# TODO confirm against its definition).
import pandas as pd

# The patterns do not change between iterations, so compile them once.
_product_main_re = re.compile(r"product_main")
_star_rating_re = re.compile(r"star-rating")
_category_href_re = re.compile(r"\.\./category/books/")
_non_digit_re = re.compile(r"[^0-9]")
_non_price_re = re.compile(r"[^0-9.]")

names = []
prices = []
nb_in_stock = []
img_urls = []
categories = []
ratings = []

# scrape data for every book URL: this may take some time
for url in booksURLs:
    soup = getAndParseURL(url)

    # product name
    names.append(soup.find("div", class_=_product_main_re).h1.text)

    # product price: keep only digits and the decimal point, so the currency
    # sign is dropped whether it was decoded as one character or as two
    price_text = soup.find("p", class_="price_color").text
    prices.append(_non_price_re.sub("", price_text))

    # number of available products: get rid of non numerical characters
    stock_text = soup.find("p", class_="instock availability").text
    nb_in_stock.append(_non_digit_re.sub("", stock_text))

    # image url: the page's base URL followed by the (relative) src of the
    # first <img> tag in the document
    img_urls.append(url.replace("index.html", "") + soup.find("img").get("src"))

    # product category: fourth "/"-separated segment of the category link
    category_href = soup.find("a", href=_category_href_re).get("href")
    categories.append(category_href.split("/")[3])

    # rating: second entry of the class list of the star-rating <p> tag
    ratings.append(soup.find("p", class_=_star_rating_re).get("class")[1])

# add data into pandas df
scraped_data = pd.DataFrame(
    {
        "name": names,
        "price": prices,
        "nb_in_stock": nb_in_stock,
        "url_img": img_urls,
        "product_category": categories,
        "rating": ratings,
    }
)
scraped_data.head()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment