Skip to content

Instantly share code, notes, and snippets.

@flxai
Created December 9, 2019 13:21
Show Gist options
  • Save flxai/268f7893316fc20050d30dcdc4834f54 to your computer and use it in GitHub Desktop.
Save flxai/268f7893316fc20050d30dcdc4834f54 to your computer and use it in GitHub Desktop.
Grab data for a category from Geizhals
#!/usr/bin/env python3
import re
import requests
import sys
import time
from bs4 import BeautifulSoup
from matplotlib.lines import Line2D
import numpy as np
import pylab as plt
# First category page to download; further pages are reached by following the
# last pagination link of each downloaded page.
# NOTE(review): the query string contains `pg=2`, which looks like the crawl
# starts on the second result page and never visits the first one - confirm
# whether that is intended.
URL_CATEGORY = 'https://geizhals.de/?cat=hhleuchtled&v=e&hloc=at&hloc=de&sort=t&pg=2#productlist'
# Text of the pagination link that leads to the next page; when the last
# pagination entry carries a different text, the crawl stops.
PAGE_NEXT_STR = 'vor »'
# Value passed to time.sleep() before every HTTP request (seconds).
SLEEP_BETWEEN_REQUESTS = 0
def read_prices(url_category):
    """Yield the prices found on each result page of a category.

    Starting at ``url_category``, every page is downloaded and parsed and
    its prices are yielded as one list. The last entry of the pagination
    bar is then followed for as long as its link text equals
    ``PAGE_NEXT_STR``.

    Args:
        url_category: URL of the first category page to download.

    Yields:
        list[float]: The prices parsed from one page, exactly once per page.

    The generator ends early, after printing an error message, when a page
    is not answered with HTTP status 200. It also ends when a page has no
    pagination bar or no usable "next" link.
    """
    # Loop-invariant: the category URL without its last '/'-separated part.
    url_dir = '/'.join(url_category.split('/')[:-1])
    url_next = url_category
    while True:
        time.sleep(SLEEP_BETWEEN_REQUESTS)
        r = requests.get(url_next)
        if r.status_code != 200:
            print(f"[ERROR] Could not download page: {url_next}")
            return
        s = BeautifulSoup(r.text, 'html.parser')

        prices_raw = []
        for price in s.find_all(class_='productlist__price'):
            s1 = price.find('span')
            if s1 is None:
                # Price cell without the expected nested spans; skip it.
                continue
            s3 = s1.find('span').find('span')
            prices_raw.append(s3.get_text())
        # The number is the last space-separated token and uses a decimal
        # comma, which float() does not understand.
        yield [float(raw.replace(',', '.').split(' ')[-1]) for raw in prices_raw]

        # Decide whether there is another page. Any missing piece of the
        # pagination markup is treated as "this was the last page".
        pagination = s.find(class_='gh_pag_wrp', recursive=True)
        entries = pagination.find_all('li') if pagination is not None else []
        link = entries[-1].a if entries else None
        if link is None or link.get_text() != PAGE_NEXT_STR:
            return
        href = link.get('href')
        if not href:
            return
        # Only the query string of the link is reused; it is appended to the
        # directory part of the category URL.
        query = href.split('?')[-1]
        url_next = f'{url_dir}/?{query}'
if __name__ == '__main__':
    # Reuse the prices cached in data.csv by an earlier run. A missing or
    # unparsable cache is treated like an empty one and triggers a download.
    try:
        # atleast_1d: a cache holding a single value loads as a 0-d array.
        prices = np.atleast_1d(np.loadtxt('data.csv'))
    except (OSError, ValueError):
        prices = np.array([])

    if prices.size == 0:
        # Pages may hold different numbers of prices, so flatten the per-page
        # lists in Python instead of building a (possibly ragged) 2-D array.
        prices = np.array(
            [price for page in read_prices(URL_CATEGORY) for price in page],
            dtype=float,
        )
        if len(prices) == 0:
            print("[ERROR] Could not download a single page. :(")
            sys.exit(1)
        print(prices)
        np.savetxt('data.csv', prices, delimiter=',')

    # Use as many bins as the integer part of the highest price, but at
    # least one so that prices below 1 do not request zero bins.
    n, _, patches = plt.hist(prices, bins=max(1, int(prices.max())))
    # Highlight the bin with the highest count in red.
    patches[n.argmax()].set_fc('r')
    plt.title('Histogramm über Preise von LED-Lampen (auf geizhals.de)')
    plt.xlabel('Preis in €')
    plt.ylabel('Anzahl')
    plt.legend([Line2D([0], [0], color='r', lw=4)], ["Maximum"])
    plt.savefig('plot.png')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment