Created
December 24, 2020 20:51
-
-
Save jitsejan/6df3a7db678aa2588e0f5d6d0660e8be to your computer and use it in GitHub Desktop.
Simple example of how to use XPath expressions to get data from a website.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dataclasses import dataclass
from typing import Iterator, Optional
from urllib.parse import urldefrag, urljoin

import lxml.html
import requests
# Root of the wiki being scraped; page paths are appended to this.
BASE_URL = "https://zelda.gamepedia.com"

# HTTP headers applied to the shared requests session used for every fetch.
HEADERS = {
    'User-Agent': 'Mozilla/5.0',
}
@dataclass(frozen=True)
class Item:
    """A scraped item and its price.

    Instances are immutable (``frozen=True``) and compare equal when both
    fields are equal.
    """

    name: str  # Taken from the item page's ``og:title`` meta tag.
    price: int  # Parsed from the "Cost(s)" row of the item page.

    def __repr__(self) -> str:
        """Return ``Item(name=..., price=...)`` with the name left unquoted."""
        # An explicitly defined __repr__ takes precedence over the one the
        # dataclass decorator would otherwise generate.
        return f'{type(self).__name__}(name={self.name}, price={self.price})'
def _get_tree_from_url(url: str, timeout: float = 10.0) -> lxml.html.HtmlElement:
    """Download ``url`` with the shared session and parse the body as HTML.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait on the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        The parsed document as returned by ``lxml.html.fromstring``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # ``session`` is the module-level ``requests.Session``.
    resp = session.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing the error page as
    # if it were real content.
    resp.raise_for_status()
    return lxml.html.fromstring(resp.text)
def get_item_links() -> Iterator[str]:
    """Yield the URL of each item page linked from the items gallery.

    The gallery page ``Items_in_The_Legend_of_Zelda`` is fetched and every
    anchor inside a gallery caption is followed.

    Yields:
        Absolute page URLs with any ``#fragment`` removed, each at most once
        and in the order they first appear on the page.
    """
    items_url = f"{BASE_URL}/Items_in_The_Legend_of_Zelda"
    tree = _get_tree_from_url(items_url)
    seen = set()
    for elem in tree.cssselect("li.gallerybox .gallerytext p a"):
        href = elem.get('href')
        if not href:
            continue  # Anchor without a target: nothing to follow.
        # Resolve the href against the page it was found on (this handles
        # relative and absolute links alike) and drop the fragment so that
        # links to different sections of one page collapse to a single URL.
        link = urldefrag(urljoin(items_url, href)).url
        if link in seen:
            continue  # Already yielded; avoid fetching the same page twice.
        seen.add(link)
        yield link
def get_item_details(link: str) -> Optional[Item]:
    """Scrape the name and price of the item described at ``link``.

    Args:
        link: Absolute URL of an item page.

    Returns:
        An ``Item`` built from the page's ``og:title`` meta tag and the
        first value in its "Cost(s)" table row, or ``None`` when the page
        has no usable name or integer price.
    """
    tree = _get_tree_from_url(link)
    try:
        name = tree.cssselect("meta[property='og:title']")[0].attrib['content']
        price = int(tree.xpath("//tr[th//text()[contains(., 'Cost(s)')]]/td/div")[0].text)
    except (IndexError, KeyError, TypeError, ValueError):
        # IndexError: the meta tag or the cost cell is missing.
        # KeyError:   the meta tag has no ``content`` attribute.
        # TypeError:  the cost cell has no direct text (``.text`` is None).
        # ValueError: the cost text is not a plain integer.
        # All of these mean "no price for this item"; anything else is a
        # genuine error and is allowed to propagate.
        return None
    return Item(name, price)
# One session shared by every request, so connections are reused.
session = requests.Session()
# Merge our headers into the session defaults; assigning a plain dict would
# replace the default header mapping and drop the defaults entirely.
session.headers.update(HEADERS)

# Visit every linked item page, keeping only the items that have a price.
items = [
    item
    for item in map(get_item_details, get_item_links())
    if item is not None
]
# Most expensive first.
items.sort(key=lambda x: x.price, reverse=True)
print(items)
# [Item(name=Bow, price=980), Item(name=Boomerang, price=300), Item(name=Blue Ring, price=250), Item(name=Arrow, price=80), Item(name=Red Water of Life, price=68), Item(name=Blue Candle, price=60), Item(name=Food, price=60), Item(name=Blue Water of Life, price=40), Item(name=Bomb, price=20), Item(name=Heart Container, price=4)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment