# Gist by @carpedm20, created September 30, 2025 08:00.

import hashlib
import json
import os
import time
from concurrent.futures import Future, ThreadPoolExecutor
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional

import requests


class SsenseError(RuntimeError):
    pass

IMAGE_CONVERT_QUERY = os.getenv("IMAGE_CONVERT_QUERY", "f_jpg,c_limit,h_1024,w_1024")
BASE_URL = "https://www.ssense.com/en-us"

SCRAPINGBEE_API_KEY = os.getenv("SCRAPINGBEE_API_KEY")
assert SCRAPINGBEE_API_KEY, "SCRAPINGBEE_API_KEY is not set"

# Global thread pool
NUM_WORKERS = int(os.getenv("SSENSE_NUM_WORKERS", "5"))
_thread_pool = ThreadPoolExecutor(max_workers=NUM_WORKERS)

# Cache directory
CACHE_DIR = Path(os.getenv("SSENSE_CACHE_DIR", ".cache/ssense"))
CACHE_DIR.mkdir(parents=True, exist_ok=True)


def _get_cache_key(url: str) -> str:
    """Generate a cache key from a URL."""
    return hashlib.md5(url.encode()).hexdigest()


def _get_cache_path(url: str) -> Path:
    """Get the cache file path for a URL, sharded by the key's first two hex chars."""
    cache_key = _get_cache_key(url)
    cache_sub_dir = CACHE_DIR / cache_key[:2]
    cache_sub_dir.mkdir(parents=True, exist_ok=True)
    return cache_sub_dir / f"{cache_key}.json"
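# For example (hypothetical page path; "<xy>" stands for the first two hex
# chars of the MD5 digest), entries are spread across up to 256 subdirectories
# instead of piling into one flat folder:
#
#     _get_cache_path(BASE_URL + "/men")
#     # -> PosixPath('.cache/ssense/<xy>/<xy>....json')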


def _load_from_cache(url: str) -> Optional[Dict[str, Any]]:
    """Load a response from cache if it exists and has not expired."""
    cache_path = _get_cache_path(url)
    if cache_path.exists():
        try:
            # Expire entries after 7 days, based on file modification time
            if time.time() - cache_path.stat().st_mtime < 86400 * 7:
                with open(cache_path, "r") as f:
                    return json.load(f)
        except Exception:
            pass
    return None


def _save_to_cache(url: str, data: Dict[str, Any]) -> None:
    """Save a response to cache; failures are non-fatal."""
    cache_path = _get_cache_path(url)
    try:
        with open(cache_path, "w") as f:
            json.dump(data, f)
    except Exception:
        pass
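# A minimal round-trip sketch of the two cache helpers above (the URL and
# payload are placeholders, not requests the gist actually makes):
#
#     _save_to_cache(BASE_URL + "/men", {"products": []})
#     _load_from_cache(BASE_URL + "/men")  # -> {"products": []}
#     # ...and None again once the file is older than the 7-day TTL.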


def _make_request(url: str, timeout: int = 30, retries: int = 1) -> Dict[str, Any]:
    """Make an HTTP request through ScrapingBee, with file-based caching."""
    # Check the cache first
    cached_data = _load_from_cache(url)
    if cached_data is not None:
        return cached_data

    resp = None
    for i in range(retries):
        resp = requests.get(
            url="https://app.scrapingbee.com/api/v1",
            params={
                "api_key": SCRAPINGBEE_API_KEY,
                "url": url,
                "render_js": "false",
            },
            timeout=timeout,
        )
        # Retry on non-200 responses until attempts are exhausted
        if resp.status_code != 200 and i < retries - 1:
            print(f"Failed to get {url}: {resp.status_code} -> retrying... ({i + 1}/{retries})")
            continue
        break

    if resp is None:
        raise SsenseError(f"Failed to get {url}: No response")
    elif resp.status_code != 200:
        if resp.status_code == 500 and "px-captcha" in resp.text:
            raise SsenseError(f"Failed to get {url}: Captcha detected")
        else:
            raise SsenseError(f"Failed to get {url}: {resp.status_code}\n{resp.text}")

    data = resp.json()
    # Save to cache for subsequent calls
    _save_to_cache(url, data)
    return data
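

if __name__ == "__main__":
    # Illustrative only: the category paths below are assumptions, not part of
    # the gist shown here. _make_request calls resp.json(), so the targets are
    # presumably SSENSE endpoints that return JSON. This sketch shows how the
    # module-level thread pool could fan out requests concurrently.
    urls = [f"{BASE_URL}/men", f"{BASE_URL}/women"]
    futures: List[Future] = [_thread_pool.submit(_make_request, u) for u in urls]
    for page_url, future in zip(urls, futures):
        try:
            data = future.result()
            print(page_url, "->", list(data)[:5])  # peek at top-level JSON keys
        except SsenseError as exc:
            print(exc)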