Skip to content

Instantly share code, notes, and snippets.

@cheeyeo
Created September 21, 2024 15:47
Show Gist options
  • Save cheeyeo/3ed5f6191a3d7e6ff6f7ec899b1a5c44 to your computer and use it in GitHub Desktop.
Save cheeyeo/3ed5f6191a3d7e6ff6f7ec899b1a5c44 to your computer and use it in GitHub Desktop.
Example of the Template Method design pattern
from abc import ABC, abstractmethod
import re
from typing import Dict, Any
import json
import requests
from bs4 import BeautifulSoup
class Scraper(ABC):
def parse_site(self, url: str, headers: dict={}) -> Dict[str, Any]:
self.res = {}
self.get_html(url, headers)
self.parse_ld_tags()
if len(self.res) < 1:
self.parse_meta_tags()
# can be overridden in subclasses
self.parse_custom()
return self.res
def get_html(self, url: str, headers: dict={}) -> None:
try:
req = requests.get(url, headers=headers)
req.raise_for_status()
self.html_body = req.text
self.soup_obj = BeautifulSoup(req.text, "lxml")
except requests.exceptions.RequestException as e:
print(str(e))
def parse_ld_tags(self) -> None:
tags = self.soup_obj.find_all("script", type="application/ld+json")
if len(tags) > 0:
print('Processing the ld tags and filling res dict...')
site_json = [x for x in tags if json.loads(x.text)["@graph"][0]["@type"] == "WebPage"][0]
site_json = json.loads(site_json.text)
self.res["title"] = f"TITLE FROM LD TAGS - {site_json["@graph"][0]['name']}"
self.res["description"] = f"DESCRIPTION FROM LD TAGS - {site_json["@graph"][0]['description']}"
def parse_meta_tags(self) -> None:
metas = self.soup_obj.find_all("meta", property=re.compile("og:*"), content=True)
if len(metas) > 0:
print('Processing meta og tags...')
for meta in metas:
match meta.get("property"):
case "og:title":
name = meta.get("content")
self.res["title"] = f"TITLE FROM META TAGS - {name}"
case "og:description":
description = meta.get("content")
self.res["description"] = f"DESCRIPTION FROM META TAGS - {description}"
# Optional method to override to add custom scraping rules
def parse_custom(self) -> None:
pass
class CustomScraper(Scraper):
    """Scraper whose custom step reads the <title> and meta description tags."""

    def parse_custom(self) -> None:
        """Overwrite ``res`` entries from the page's title / meta description.

        Each entry is only written when the corresponding element exists;
        ``find`` returns ``None`` for a missing element, which previously
        made this method raise on pages lacking either tag.
        """
        # Look for title tags and meta description tags only
        title = self.soup_obj.find("title")
        if title is not None:
            self.res["title"] = f"TITLE FROM PARSE CUSTOM - {title.text}"

        description = self.soup_obj.find("meta", attrs={'name': 'description'})
        # A <meta name="description"> without a content attribute has nothing to report.
        if description is not None and description.get("content") is not None:
            self.res["description"] = (
                f"DESCRIPTION FROM PARSE CUSTOM - {description['content']}"
            )
if __name__ == "__main__":
    # Demo: scrape the same page with the base pipeline first, then with
    # the subclass that overrides the custom step, printing each result.
    url = "https://www.nil.com"
    for scraper_cls in (Scraper, CustomScraper):
        print(scraper_cls().parse_site(url))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment