Skip to content

Instantly share code, notes, and snippets.

@wiiaboo
Created May 4, 2014 00:02
Show Gist options
  • Save wiiaboo/11508340 to your computer and use it in GitHub Desktop.
Save wiiaboo/11508340 to your computer and use it in GitHub Desktop.
# Design notes (TODO):
# for each scraper, create a corresponding class
# each scraper has a get_price function that reports the current price
# each scraper has a timer: wait X seconds between requests to avoid breaking the scrapes
# each scraper is started only once per session
# create scrapers for kuantokusta, alientech, moddingworld, chiptec, jifortec, pcdiga,
# use time.sleep()
# import google
from requests import get
from lxml.html import fromstring
from urllib.parse import urlparse
# User-Agent header sent with every request.
headers = {'User-Agent': 'Mozilla/5.0'}

# Seconds to wait for a shop to answer before giving up on that URL.
REQUEST_TIMEOUT = 30

# Per-host extraction rule, keyed by the URL's network location:
#   (XPath selecting the price text node,
#    whether to strip surrounding whitespace before trimming,
#    number of trailing characters to drop -- presumably the currency
#    suffix; verify against the live pages).
# Plain data replaces the former eval()'d source strings.
formats = {
    'www.kuantokusta.pt': ('//span[@itemprop="lowPrice"]/text()', False, 0),
    'www.alientech.pt': ('//span[@class="productSpecialPrice_prd"]/text()', False, 1),
    'moddingworld.pt': ('//span[@id="our_price_display"]/text()', False, 2),
    'www.chiptec.net': ('//span[@itemprop="price"]/text()', False, 2),
    'www.jifortec.pt': ('//span[@class="price"]/text()', False, 2),
    'www.pcdiga.com': ('//span[@id="preco_grelha"]/text()', False, 2),
    'www.nanochip.pt': ('//span[@id="price"]/text()', True, 2),
}

# Product pages whose prices are printed when the script runs.
urls = [
    'http://www.kuantokusta.pt/1/309637/Gigabyte-GeForce-GTX760-OC-Windforce-3X-2GB-GDDR5-PCI-E-GV-N760OC-2GD',
    'http://www.nanochip.pt/pt-PT/p/47848/ASROCK-AM1B-M-Socket-AM1_ASROCKAM1BM.htm',
    'http://moddingworld.pt/loja/index.php?controller=product&id_product=43745',
    'http://www.alientech.pt/product_info.php?products_id=23902',
    'http://www.jifortec.pt/index.php/dimm-kingston-hyperx-blu-ddr3-2gb-1600mhz-khx1600c9ad3b1-2g.html',
    'http://www.chiptec.net/componentes-para-computadores/motherboards/motherboards-com-processador/gigabyte-ga-am1m-s2h-socket-am1.html',
    'http://www.pcdiga.com/2/8892/Caixa-SilverStone-Raven-2-Preta-SST-RV02B-EW-USB3-0',
]


def parse_price(raw, strip=False, trim=0):
    """Convert scraped price text into the rounded integer the script prints.

    Args:
        raw: Text node taken from the page, using a decimal comma.
        strip: When true, remove surrounding whitespace before trimming.
        trim: Number of trailing characters to drop before conversion.

    Returns:
        ``round(value + 0.2)`` where ``value`` is the parsed number.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    text = raw.strip() if strip else raw
    # Slice by explicit length so that trim == 0 keeps the whole string
    # (text[:-0] would be empty).
    text = text[:len(text) - trim]
    # The 0.2 bias before rounding is kept exactly as in the original script.
    return round(float(text.replace(',', '.')) + 0.2)


def get_price(url):
    """Fetch ``url`` and return its price, or ``"N/A"`` when unavailable.

    ``"N/A"`` is returned when the host has no entry in ``formats``, the
    request fails, the page is empty, the XPath matches nothing, or the
    matched text cannot be parsed as a number.
    """
    rule = formats.get(urlparse(url).netloc)
    if rule is None:
        # Unknown shop: skip the download entirely.
        return 'N/A'
    xpath, strip, trim = rule

    try:
        page = get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except OSError:
        # requests' RequestException derives from IOError (OSError), so this
        # covers connection failures and timeouts without a new import.
        return 'N/A'

    if not page.text.strip():
        # Nothing to parse.
        return 'N/A'

    matches = fromstring(page.text).xpath(xpath)
    if not matches:
        # Page layout changed or the product is gone; keep going with the
        # remaining URLs instead of raising IndexError.
        return 'N/A'

    try:
        return parse_price(matches[0], strip=strip, trim=trim)
    except ValueError:
        return 'N/A'


# Guarded so that importing this module does not trigger network requests;
# running the file as a script prints the same report as before.
if __name__ == '__main__':
    for url in urls:
        print('{}\t->\t{}'.format(get_price(url), url[:40]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment