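# Scrapes product data from shop.ibc-solar.de: the script logs in through
# portal.ibc-solar.de with the credentials from settings.cfg, walks the PV
# category and subcategory pages, downloads every item page into a temporary
# folder, then extracts name, article number, price, stock and the future
# delivery dates/stock into the CSV file named in settings.cfg.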
import os
import re
import sys
import csv
import shutil
# import pickle
import requests
import threading
import concurrent.futures
from glob import glob
from tqdm import tqdm
from bs4 import BeautifulSoup
from configparser import RawConfigParser
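
# Expected settings.cfg layout in the working directory (values below are placeholders):
#
# [General]
# login = your_login
# password = your_password
# output_file = ibc_items.csv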
config = RawConfigParser()
config.read('settings.cfg')
login = config.get('General', 'login').strip()
password = config.get('General', 'password').strip()
output_file = config.get('General', 'output_file').strip()
assert login and password and output_file

temp_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'temp')
if os.path.isdir(temp_dir_path):
    shutil.rmtree(temp_dir_path)
os.mkdir(temp_dir_path)
post_data = {
    'user': login,
    'pass': password,
    'permalogin': '0',
    'logintype': 'login',
    'pid': '4'
}

print('Logging in...')
session = requests.Session()
session.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
response = session.post('https://portal.ibc-solar.de/', data=post_data)
# After a successful login the returned page contains "Abmelden" (German for "log out").
if 'Abmelden' not in response.text:
    print('Unable to log in.')
    sys.exit(1)
# with open('cookies', 'wb') as f:
#     pickle.dump(session.cookies, f)
cookies = session.cookies
print('Downloading subcategories...')
category_urls = [
    'https://shop.ibc-solar.de/shop/de/shop/PV/Solarmodule/',
    'https://shop.ibc-solar.de/shop/de/shop/PV/Wechselrichter/',
    'https://shop.ibc-solar.de/shop/de/shop/PV/Speicher/',
    'https://shop.ibc-solar.de/shop/de/shop/PV/Zubehoer/'
]
# The only subcategory of the first category (Solarmodule) is already known,
# so it is listed here directly and the category itself is skipped below.
subcategory_urls = [
    'https://shop.ibc-solar.de/shop/de/shop/PV/Solarmodule/Module/'
]
for current_category_url in tqdm(category_urls[1:]):
    current_page_number = 1
    while True:
        soup = BeautifulSoup(session.get(f'{current_category_url}?page={current_page_number}').text, 'lxml')
        for soup_link in soup.find_all('a', class_='categorylist__item'):
            subcategory_urls.append('https://shop.ibc-solar.de{}'.format(soup_link.get('href')))
        if current_page_number == 1:
            soup_div = soup.find('div', class_='pagination__item pagination__item--text')  # item?
            if soup_div is None:
                pages_total_number = 1
            else:
                # "Seite 1 von N" means "page 1 of N".
                pages_total_number = int(re.match(r'Seite 1 von (\d+)', soup_div.text).groups()[0])
        if current_page_number < pages_total_number:
            current_page_number += 1
        else:
            break
print('Downloading items list...')
item_urls = []
for current_subcategory_url in tqdm(subcategory_urls):
    current_page_number = 1
    while True:
        soup = BeautifulSoup(session.get(f'{current_subcategory_url}?page={current_page_number}').text, 'lxml')
        for soup_link in soup.find_all('a', class_='itemlist__wrapper'):
            url = soup_link.get('href')
            if url not in item_urls:
                item_urls.append(url)
        if current_page_number == 1:
            soup_div = soup.find('div', class_='pagination__item pagination__item--text')
            if soup_div is None:
                pages_total_number = 1
            else:
                pages_total_number = int(re.match(r'Seite 1 von (\d+)', soup_div.text).groups()[0])
        if current_page_number < pages_total_number:
            current_page_number += 1
        else:
            break
# Number the item URLs so every downloaded page gets a stable, sortable file name.
item_urls = list(enumerate(item_urls, start=1))
# import json
# json.dump(item_urls, open('temp.json', 'w', encoding='utf-8'), ensure_ascii=False, indent=1)
# import json
# item_urls = json.load(open('temp.json', encoding='utf-8'))
print('Downloading items...')
thread_local = threading.local()

# requests.Session is not guaranteed to be thread-safe, so every worker thread
# gets its own session kept in thread-local storage.
def get_session():
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()
    return thread_local.session

def download_item_page(item_url):
    session = get_session()
    session.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    # with open('cookies', 'rb') as f:
    #     session.cookies.update(pickle.load(f))
    session.cookies.update(cookies)
    page_file_path = os.path.join(temp_dir_path, f'item{item_url[0]:07d}.html')
    while True:
        try:
            with open(page_file_path, 'w', encoding='utf-8', newline='\n') as f:
                f.write(session.get(f'https://shop.ibc-solar.de{item_url[1]}', timeout=60).text)
        except Exception:
            pass
        finally:
            # Keep retrying until a page of plausible size (> 50 KB) has been saved.
            if os.path.isfile(page_file_path) and os.stat(page_file_path).st_size > 50000:
                break
    progress_bar.update()

progress_bar = tqdm(total=len(item_urls))
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    executor.map(download_item_page, item_urls)
progress_bar.close()
print('Processing...')
csv_column_names = ('name', 'article no.', 'price', 'stock',
                    'future_delivery_date1', 'future_delivery_stock1',
                    'future_delivery_date2', 'future_delivery_stock2',
                    'future_delivery_date3', 'future_delivery_stock3',
                    'future_delivery_date4', 'future_delivery_stock4',
                    'future_delivery_date5', 'future_delivery_stock5',
                    'future_delivery_date6', 'future_delivery_stock6',
                    'future_delivery_date7', 'future_delivery_stock7',
                    'future_delivery_date8', 'future_delivery_stock8')
f_out = open(output_file, 'w', newline='', encoding='utf-8')
writer = csv.DictWriter(f_out, fieldnames=csv_column_names)
writer.writeheader()
try:
    for file_path in tqdm(sorted(glob(os.path.join(temp_dir_path, 'item*.html')))):
        with open(file_path, encoding='utf-8') as f_in:
            soup = BeautifulSoup(f_in, 'lxml')
        row = {}
        row['name'] = soup.find('span', itemprop='name').text.strip()
        row['article no.'] = soup.find('div', class_='itemcardItemno').text.strip().replace('Artikel-Nr.: ', '')
        # Strip the euro sign and turn a trailing ",-" into ",00".
        row['price'] = soup.find('div', class_='base_price').text.replace('€', '').replace(',-', ',00').strip()
        # "verfügbar" means "available".
        if soup.find('div', class_='inventory__label').text.strip() == 'verfügbar':
            row['stock'] = 'yes'
        else:
            row['stock'] = 0
        # The ancestor container of the delivery-date cells also holds an inline
        # script whose "data: [...]" array carries the stock figure for each of
        # the eight future delivery windows.
        soup_div = soup.find('div', class_='table_cell datecell').parent.parent.parent.parent.parent.parent
        soup_script = soup_div.find_all('script')[1]
        match = re.match(r'^.+data: \[([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*),\s*([^,]*)\]', soup_script.string, flags=re.DOTALL)
        for n, soup_cell in enumerate(soup_div.find_all('div', class_='table_cell datecell'), start=1):
            # e.g. "20.04.-26.04.2020" -> "20.04." + "2020" = "20.04.2020"
            m = re.match(r'([^-]+).+(\d{4})$', soup_cell.text.strip())
            row[f'future_delivery_date{n}'] = (m[1] + m[2]) if m else ''
            row[f'future_delivery_stock{n}'] = match[n]
        writer.writerow(row)
finally:
    f_out.close()
shutil.rmtree(temp_dir_path)
# os.remove('cookies')
print('Success!')