Drinkshop
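"""Scrape beer detail pages from drinkshop.sk (lxml + requests) and write the
collected attributes to output.csv, pairing every two products into one row."""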
import csv
import unicodedata

import requests
from lxml import html


def normalize_key(key):
    """Strip diacritics and turn a Slovak label into a snake_case ASCII key."""
    output = unicodedata.normalize('NFKD', key).encode('ASCII', 'ignore')
    return output.title().decode('utf-8').lower().replace(' ', '_')


def parse_url(link):
    """Fetch one product page and return a dict of its attributes."""
    page = requests.get(link)
    tree = html.fromstring(page.content)
    title = tree.xpath('//h2[@class="flypage-h1"]/text()[1]')[0]
    description = tree.xpath(
        '//div[@class="tab-page"]/hr/following-sibling::p[1]/text()')[0].replace('"', "'")
    # Collect every text node from the detail tab, replacing non-breaking
    # spaces and trimming stray punctuation, then drop empty strings.
    data = [x.replace(u'\xa0', u' ').strip(', .') for x in tree.xpath(
        '//div[@class="tab-page"]/descendant::*/text()')]
    data_final = [x for x in data if x.strip()]
    # Slovak keys kept as on the site: meno = name, druh_piva/poddruh_piva =
    # beer type/sub-type, vyrobca = producer, farba = colour, objem = volume,
    # obsah_alkoholu = ABV, stupnovitost = gravity, popis = description,
    # zlozenie = ingredients.
    json_dict = {'meno': title, 'druh_piva': '', 'poddruh_piva': '', 'vyrobca': '',
                 'farba': '', 'objem': '', 'obsah_alkoholu': '', 'stupnovitost': '',
                 'popis': description, 'zlozenie': '', 'image': ''}
    for index, x in enumerate(data_final, start=1):
        prop = x.split(':')
        if len(prop) == 2:
            # Ordinary "Label: value" pair.
            json_dict[normalize_key(prop[0].strip())] = prop[1].strip()
        if prop[0] == 'Výrobca':
            # The producer value spans the next two text nodes.
            json_dict[normalize_key(prop[0].strip())] = ', '.join(
                data_final[index:index + 2])
        if prop[0] == 'Zloženie':
            # The ingredient list continues across the remaining text nodes.
            json_dict[normalize_key(prop[0].strip())] = ', '.join(
                [prop[1].strip()] + data_final[index:])
    return json_dict


def export_to_csv(data):
    """Write scraped products to output.csv, merging each pair of products
    into one row (the second product's columns carry a '2' suffix)."""
    keys = ['dataset']
    for key in data[0].keys():
        keys.append(key)
        keys.append(f'{key}2')
    result = []
    for index, item in enumerate(data):
        if index % 2 == 0:
            result.append(item)
        else:
            # Fold the odd-indexed product into the preceding row.
            for key, val in item.items():
                result[-1][f'{key}2'] = val
    with open('output.csv', 'w', newline='') as output:
        dict_writer = csv.DictWriter(output, fieldnames=keys)
        dict_writer.writeheader()
        for index, row in enumerate(result):
            row['dataset'] = index + 1
            dict_writer.writerow(row)


urls = [
'https://www.drinkshop.sk/pivo/lucky-bastard-cherry-pussy/',
'https://www.drinkshop.sk/pivo/van-moll-langharig-tuig/',
'https://www.drinkshop.sk/pivo/bevog-punk-rock-hazy-holiday-can/',
'https://www.drinkshop.sk/pivo/pinta-bdpf-krol-lata-le-roi-de-lete/',
'https://www.drinkshop.sk/pivo/general-grunt/',
'https://www.drinkshop.sk/pivo/omnipollo-perikles/',
'https://www.drinkshop.sk/pivo/oakham-citra/',
'https://www.drinkshop.sk/pivo/pinta-viva-la-wita/',
]

result = []
for url in urls:
    try:
        data = parse_url(url)
        if not len(data):
            # Flag pages that yielded nothing.
            print(url)
        result.append(data)
    except Exception as err:
        # Report the failing URL but keep going.
        print(err)
        print(url)

print(result)
export_to_csv(result)