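# Scrape product listings (category, name, price, stock, url) from
# https://www.alternate.ch and write them to products.csv.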
from bs4 import BeautifulSoup
from splinter import Browser
from selenium.webdriver.chrome.options import Options
import csv
# Category slugs on alternate.ch; 'pages' holds optional sub-categories
# that get their own listing URLs.
categories = [
    {'Arbeitsspeicher': {'pages': []}},
    {'Capture-Karten': {'pages': []}},
    {'PC-Komponenten': {'pages': ['CPUs', 'Festplatten', 'Laufwerke', 'Mainboards']}},
    {'Grafikkarten': {'pages': []}},
    {'Netzteile': {'pages': []}},
    {'Netzwerkkarten': {'pages': []}},
    {'PC-Gehäuse': {'pages': []}},
    {'PC-Gehäuselüfter': {'pages': []}},
    {'Soundkarten': {'pages': []}},
    {'SSD': {'pages': []}},
    {'Wasserkühlungen': {'pages': []}}
]
fieldnames = ['category', 'name', 'price', 'stock', 'url']
csv_file = 'products.csv'
results = []
def scraper(browser, url, cat, page):
    browser.visit(url)
    while True:
        # Regular listing page: each product is an <a class="productBox">.
        if browser.is_element_visible_by_css('.productBox', wait_time=2):
            soup = BeautifulSoup(browser.html, 'lxml')
            products = soup.find_all('a', {'class': 'productBox'})
            print(f'found {len(products)} {cat} products')
            for a in products:
                url = a['href']
                for row in a.find_all('div', {'class': 'my-3'}):
                    name = row.find('div', {'class': 'product-name'})
                    # The brand sits in a nested <span>; re-insert it followed
                    # by a space so brand and model read as one clean name.
                    brand = name.find('span').text
                    name = name.get_text(strip=True).replace(brand, f'{brand} ')
                    price = row.find('span', {'class': 'price'}).get_text(strip=True).replace('CHF ', '')
                    delivery_info = row.find('div', {'class': 'delivery-info'}).get_text(strip=True)
                    results.append({'category': cat, 'name': name, 'price': price, 'stock': delivery_info, 'url': url})
            break  # END LIST PAGE
        # Carousel page: stock is not shown on the card itself, so open each
        # product in a second window and read it from the product page.
        elif browser.is_element_visible_by_css('.product-carousel-card', wait_time=2):
            soup = BeautifulSoup(browser.html, 'lxml')
            products = soup.find_all('div', {'class': 'product-carousel-card'})
            print(f'found {len(products)} {page} products')
            for card in products:
                url = card.find('a', href=True)['href']
                brand = card.find('div', {'class': 'manufacturer'}).text
                name = card.find('div', {'class': 'product-name'}).text
                name = f'{brand} {name}'
                price = card.find('div', {'class': 'price'}).get_text(strip=True).replace('CHF ', '')
                browser.execute_script(f'window.open("{url}");')
                browser.windows.current = browser.windows[1]
                soup = BeautifulSoup(browser.html, 'lxml')
                delivery_info = soup.find('b').get_text(strip=True)
                browser.windows[1].close()
                browser.windows.current = browser.windows[0]
                results.append({'category': cat, 'name': name, 'price': price, 'stock': delivery_info, 'url': url})
            break  # END CAROUSEL PAGE
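# Hypothetical single-category run (not in the original script), handy for
# testing the selectors without walking every category; the URL follows the
# same pattern main() builds below:
#     with Browser('chrome') as browser:
#         scraper(browser, 'https://www.alternate.ch/SSD/?lpf=9999', 'SSD', '')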
def writer(csv_file):
    # newline='' stops the csv module from inserting blank rows on Windows.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as f:
        csv_writer = csv.DictWriter(f, fieldnames=fieldnames)
        csv_writer.writeheader()
        for result in results:
            csv_writer.writerow(result)
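# A minimal sketch (not part of the original script) for sanity-checking the
# output: read products.csv back and print the first few rows. Assumes the
# scraper has already run and written the file; 'preview' is a made-up helper.
def preview(path=csv_file, limit=5):
    with open(path, newline='', encoding='utf-8') as f:
        for i, row in enumerate(csv.DictReader(f)):
            if i >= limit:
                break
            print(row['category'], row['name'], row['price'], row['stock'])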
def main():
    chrome_options = Options()
    # Load uBlock Origin so ads do not slow the pages down.
    chrome_options.add_extension('ublock_origin.crx')
    with Browser('chrome', chrome_options=chrome_options) as browser:
        for category in categories:
            for cat, all_pages in category.items():
                pages = all_pages['pages']
                if pages:
                    # Categories with sub-pages get one listing URL per sub-page.
                    for page in pages:
                        page_url = f'{cat}/{page}/'
                        url = f'https://www.alternate.ch/{page_url}?lpf=9999'
                        scraper(browser, url, cat, page)
                else:
                    page = ''
                    url = f'https://www.alternate.ch/{cat}/?lpf=9999'
                    scraper(browser, url, cat, page)
    writer(csv_file)


if __name__ == '__main__':
    main()