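"""Cached scraping helpers for ssense.com, fetched through the ScrapingBee API.

Responses are cached on disk (default: .cache/ssense) and expire after 7 days.
"""
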
import hashlib
import json
import os
import time
from concurrent.futures import Future, ThreadPoolExecutor
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional

import requests


class SsenseError(RuntimeError):
    pass


# Cloudinary-style image transform query (not used in this excerpt)
IMAGE_CONVERT_QUERY = os.getenv("IMAGE_CONVERT_QUERY", "f_jpg,c_limit,h_1024,w_1024")
BASE_URL = "https://www.ssense.com/en-us"

SCRAPINGBEE_API_KEY = os.getenv("SCRAPINGBEE_API_KEY")
assert SCRAPINGBEE_API_KEY, "SCRAPINGBEE_API_KEY is not set"

# Global thread pool
NUM_WORKERS = int(os.getenv("SSENSE_NUM_WORKERS", "5"))
_thread_pool = ThreadPoolExecutor(max_workers=NUM_WORKERS)

# Cache directory
CACHE_DIR = Path(os.getenv("SSENSE_CACHE_DIR", ".cache/ssense"))
CACHE_DIR.mkdir(parents=True, exist_ok=True)


def _get_cache_key(url: str) -> str:
    """Generate a cache key from a URL."""
    return hashlib.md5(url.encode()).hexdigest()


def _get_cache_path(url: str) -> Path:
    """Get the cache file path for a URL, sharded by the key's first two hex chars."""
    cache_key = _get_cache_key(url)
    cache_sub_dir = CACHE_DIR / cache_key[:2]
    cache_sub_dir.mkdir(parents=True, exist_ok=True)
    return cache_sub_dir / f"{cache_key}.json"


def _load_from_cache(url: str) -> Optional[Dict[str, Any]]:
    """Load a response from the cache if it exists and has not expired."""
    cache_path = _get_cache_path(url)
    if cache_path.exists():
        try:
            # Expire cache entries after 7 days, based on file modification time
            if time.time() - cache_path.stat().st_mtime < 86400 * 7:
                with open(cache_path, "r") as f:
                    return json.load(f)
        except Exception:
            pass
    return None


def _save_to_cache(url: str, data: Dict[str, Any]) -> None:
    """Save response to cache."""
    cache_path = _get_cache_path(url)
    try:
        with open(cache_path, "w") as f:
            json.dump(data, f)
    except Exception:
        pass


def _make_request(url: str, timeout: int = 30, retries: int = 1) -> Dict[str, Any]:
    """Make an HTTP request through ScrapingBee, with on-disk caching."""
    # Check cache first
    cached_data = _load_from_cache(url)
    if cached_data is not None:
        return cached_data

    resp = None
    for i in range(retries):
        resp = requests.get(
            url="https://app.scrapingbee.com/api/v1",
            params={
                "api_key": SCRAPINGBEE_API_KEY,
                "url": url,
                "render_js": "false",
            },
            timeout=timeout,
        )
        if resp.status_code != 200:
            if i < retries - 1:
                print(f"Failed to get {url}: {resp.status_code} -> retrying... ({i + 1}/{retries})")
                continue
        # Success, or out of retries: stop looping
        break

    if resp is None:
        raise SsenseError(f"Failed to get {url}: No response")
    elif resp.status_code != 200:
        if resp.status_code == 500 and "px-captcha" in resp.text:
            raise SsenseError(f"Failed to get {url}: Captcha detected")
        else:
            raise SsenseError(f"Failed to get {url}: {resp.status_code}\n{resp.text}")

    data = resp.json()
    # Save to cache
    _save_to_cache(url, data)
    return data