Created
November 21, 2019 17:53
-
-
Save protorob/d826ec1387e0b992f51066f810950fbe to your computer and use it in GitHub Desktop.
Scraper for a website
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import libraries
import requests
from bs4 import BeautifulSoup

# Walk the paginated product listing, open each product's own page to pick up
# its short description, and write one semicolon-separated row per product to
# bomboniere.csv. Each row is also echoed to stdout as it is written.

# Listing URL; the page number is appended after the trailing "?p=".
website = 'https://www.sito.it/bombon/bomboniere-matrimonio.html?p='

# Running row number, written as the first column of every row.
counter = 1

# The context manager guarantees the file is flushed and closed, even when a
# request or a lookup raises part-way through. (Previously the script ended
# with a bare `output_file.close`, which references the method without
# calling it, so the file was never explicitly closed.)
with open('bomboniere.csv', 'w+') as output_file:
    # Listing pages 1 through 66 inclusive.
    for current_page in range(1, 67):
        website_url = website + str(current_page)
        page = requests.get(website_url)
        soup = BeautifulSoup(page.text, 'html.parser')

        # Every <li class="item"> on the listing page is treated as one product.
        single_product_container = soup.find_all('li', class_='item')

        for product_container in single_product_container:
            product_image = product_container.find(class_='product-image-area')
            product_img_src = product_image.find('img')
            product_link = product_image.find('a', class_='product-image').get('href')

            # Get product description: it is read from the product's own page,
            # so this costs one extra request per product.
            request_prod_page = requests.get(product_link)
            prod_soup = BeautifulSoup(request_prod_page.text, 'html.parser')

            # Check if there's a product description (look it up only once).
            short_description = prod_soup.find('div', class_='short-description')
            if short_description:
                # Flatten tabs and line breaks so the row stays on one line.
                product_description = (
                    short_description.text
                    .replace("\t", " ")
                    .replace("\r", " ")
                    .replace("\n", " ")
                )
            else:
                product_description = ' - '

            # Get product image src
            img_url = product_img_src['src']

            # Get product title
            product_title = product_container.find(class_='product-name').find('a').contents[0]

            # Get product code: the first child of the <b> inside the
            # inline-styled <p> of the details area.
            product_cod = product_container.find(class_='details-area').find('p', attrs={'style': 'color:#777;'}).find('b')
            product_cod = product_cod.contents[0]

            # Format output.
            # NOTE(review): fields are joined with ';' without quoting, so a
            # title or description containing ';' will shift columns — confirm
            # whether consumers of the file need csv-style escaping.
            string_to_write = f'{counter};{product_cod};{product_title};{product_description};{img_url};{product_link}\n'
            print(string_to_write)
            output_file.write(string_to_write)
            counter += 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment