import asyncio
import logging
import os
import random
import time
from contextlib import asynccontextmanager
from datetime import datetime
from pathlib import Path
from typing import Annotated, Any, Callable, Dict, List, Optional

from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from playwright.async_api import Browser, BrowserContext
from playwright.async_api import Error as PlaywrightError
from playwright.async_api import Locator, Page
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async

from src.scrapper.scrape_product_data_async import extract_product_data_async
from src.scrapper.scrape_product_tiles_async import scrape_product_overview_tiles

from .constants import (SELECTOR_CATEGORY_ITEM, SELECTOR_CATEGORY_LABEL,
                        SELECTOR_CATEGORY_LABEL_SAFE,
                        SELECTOR_HOMEPAGE_PRODUCTS_COLUMN,
                        SELECTOR_INDEX_ENTRY_IMAGE, SELECTOR_INDEX_ENTRY_ITEM,
                        SELECTOR_INDEX_ENTRY_LINK, SELECTOR_INDEX_ENTRY_TITLE,
                        SELECTOR_INDEX_LIST_CONTAINER,
                        SELECTOR_INDEX_PAGE_HEADER,
                        SELECTOR_PRODUCTS_INNERMOST_CONTAINER,
                        SELECTOR_SUBCATEGORY_LINK,
                        SELECTOR_SUBCATEGORY_LINK_SAFE)
from .constants import _ResponseData as Response
from .utils import (browser_context, extract_product_link_from_tile,
                    fallback_locator, get_random_user_agent, goto_with_retry,
                    human_delay, is_valid_product_page, retry_with_backoff,
                    write_category_to_excel)

logger = logging.getLogger(__name__)

url: str = "https://www.medicalexpo.com/"


async def scrape_url(
    url: str,
    headless: bool = False,
    debug: bool = False,
    slow_mo: int = 40,
    wait_for_load: int = 3000,
    to_excel: bool = False,
    output_dir: Path | None = None,
    send_notification: bool = False,
) -> None:
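    """Open the target URL in a stealth browser context and kick off the scrape.

    Note: ``slow_mo``, ``output_dir`` and ``send_notification`` are accepted but
    not yet wired up in this snippet.
    """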
    try:
        async with browser_context(
            headless=headless,
            user_agent=get_random_user_agent(),
            bypass_csp=True,
        ) as ctx:
            page = await ctx.new_page()
            await stealth_async(page)
            await retry_with_backoff(lambda: page.goto(url, wait_until="networkidle"))
            if debug and wait_for_load > 0:
                await page.wait_for_timeout(wait_for_load)
            print("Checking for page response")
            parent_container_visible = await page.is_visible(
                SELECTOR_HOMEPAGE_PRODUCTS_COLUMN
            )
            if parent_container_visible:
                print("[INFO] Parent Container is Visible")
                await entrypoint(page, to_excel=to_excel)
    except (PlaywrightError, TimeoutError) as play_err:
        logger.exception(f"Error scraping URL: {play_err}")


async def scrape_all_subcategory_indexes(ctx: BrowserContext, categories):
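    """Scrape the index page of every subcategory, at most 8 pages at a time."""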
    sem = asyncio.Semaphore(8)
    jobs = []
    for section in categories:
        for sub in section["subcategories"]:

            # Bind the loop variables via default arguments so each task
            # scrapes its own subcategory rather than the last one iterated.
            async def scrape_subcategory(name=sub["name"], url=sub["url"], storage=sub):
                async with sem:
                    page = await ctx.new_page()
                    try:
                        await scrape_product_listing_index(
                            page, name, url, storage_=storage
                        )
                    except Exception as e:
                        print(f"[ERROR] Failed scraping {name}: {e}")
                    finally:
                        await page.close()

            jobs.append(scrape_subcategory())
    await asyncio.gather(*jobs)


async def scrape_product_listing_index(
    page: Page,
    subcategory_name: str,
    subcategory_url: str,
    storage_: Optional[Response] = None,
) -> None:
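    """Collect the index entries (title, link, image metadata) listed on a subcategory page."""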
    print(f"[INFO] Navigating to subcategory page: {subcategory_url}")
    await retry_with_backoff(lambda: page.goto(subcategory_url))
    await page.wait_for_selector(SELECTOR_INDEX_PAGE_HEADER)
    page_heading = await (
        await page.query_selector(SELECTOR_INDEX_PAGE_HEADER)
    ).inner_text()
    if page_heading.strip().lower() != subcategory_name.strip().lower():
        print(
            f"[WARN] Page mismatch: Expected '{subcategory_name}', got '{page_heading}'"
        )
        return
    # Wait for the parent container before collecting the category groups
    await page.wait_for_selector("div#category-group ul.category-grouplist")
    group_nodes = await page.query_selector_all(
        "div#category-group ul.category-grouplist"
    )
    index_entries = []
    for group in group_nodes:
        item_nodes = await group.query_selector_all("li")
        for item in item_nodes:
            a_tag = await item.query_selector("a")
            img_tag = await item.query_selector("div.imgSubCat img")
            if not a_tag:
                continue
            name = (await a_tag.inner_text()).strip()
            href = await a_tag.get_attribute("href")
            img_src = await img_tag.get_attribute("src") if img_tag else ""
            img_alt = await img_tag.get_attribute("alt") if img_tag else ""
            index_entries.append(
                {
                    "title": name,
                    "href": href,
                    "image_meta": {
                        "src": img_src,
                        "alt": img_alt,
                    },
                }
            )
    print(index_entries)
    if storage_ is not None:
        storage_["index_entries"] = index_entries
    print(
        f"[INFO] Extracted {len(index_entries)} index entries from '{subcategory_name}'"
    )


async def extract_categories(
    page: Page, logger_func: Optional[Callable[[str], None]] = None
):
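    """Walk the top-level category items on the homepage and collect their subcategory links."""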
    logger_func = logger_func or print
    logger_func("[*] Looking for top-level category items...")
    section_items = await fallback_locator(
        page,
        [
            "li[data-cy^='universGroupItemCy_']",
            SELECTOR_CATEGORY_ITEM,
        ],
    )
    section_items = await section_items.all()
    logger_func(f"[+] Found {len(section_items)} top-level category items")
    categories: List[Dict[str, Any]] = []
    for i, section in enumerate(section_items):
        logger_func(f"\n[→] Processing category index {i}")
        try:
            label_node = await fallback_locator(
                page,
                scope=section,
                selectors=[
                    ":scope span[class*='UniverseGroupLabel']",
                    ":scope span[class*='universeGroup__UniverseGroupLabel']",
                    ":scope span",
                ],
            )
            print(f"[INFO] {label_node}")
            category_name = (await label_node.inner_text()).strip()
            logger_func(f" [✓] Category name: '{category_name}'")
        except Exception as e:
            logger_func(f" [!] Failed to extract category name: {e}")
            continue
        # Expand the dropdown so the subcategory links are rendered
        try:
            # Wait up to 5 seconds for the item, then click to expand it
            await section.wait_for(timeout=5000)
            await section.click(timeout=2000)
            await human_delay(0.2)
            logger_func(" [✓] Clicked to expand dropdown")
        except Exception as e:
            logger_func(f" [!] Failed to expand category '{category_name}': {e}")
        subsections = await section.locator("ul li a").all()
        logger_func(f"[→] Section: {category_name} ({len(subsections)} subcategories)")
        subcategories = []
        for subsection in subsections:
            try:
                name = (await subsection.inner_text()).strip()
                href = await subsection.get_attribute("href")
                if name and href:
                    subcategories.append({"name": name, "url": href})
                    logger_func(f" [✓] Subsection: {name}")
            except Exception as e:
                logger_func(f" [!] Failed to extract subsection link: {e}")
        categories.append(
            {
                "section": category_name,
                "subcategories": subcategories,
            }
        )
        logger_func(
            f"[→] Completed Section: {category_name} ({len(subsections)} subcategories)"
        )
    logger_func("\n[✓] Completed extracting all categories.")
    return categories


async def extract_categories_from_homepage(
    page: Page, storage_: Optional[Response] = None
):
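    """Wait for the homepage products container to attach and become visible, then extract the category tree."""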
    print("[INFO] Entered inside the function: extract_categories_from_homepage")
    try:
        await page.wait_for_selector(
            SELECTOR_PRODUCTS_INNERMOST_CONTAINER, state="attached", timeout=15000
        )
        print("[INFO] Selector attached to DOM")
        container = page.locator(SELECTOR_PRODUCTS_INNERMOST_CONTAINER)
        is_visible = await container.is_visible()
        print(f"[INFO] Container visibility: {is_visible}")
        if not is_visible:
            print("[INFO] Element is attached but not visible")
            return
        print("[INFO] Element is attached AND visible. Proceeding.")
    except Exception:
        # PlaywrightTimeoutError is the expected failure here, but any error at
        # this stage means the container never became usable.
        print("[ERROR] Innermost container never appeared in DOM")
        return
    try:
        categories = await extract_categories(page)
    except Exception as e:
        print(f"[ERROR] Failed to extract categories: {e}")
        return
    if storage_:
        storage_["categories"] = categories
    print(f"[INFO] Extracted {len(categories)} top-level sections.")
    return categories


async def scrape_product_overview(
    ctx: BrowserContext,
    categories: List[Dict[str, Any]],
    logger_func: Optional[Callable] = None,
):
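    """Visit every index entry, scrape its product tiles, then follow each tile's
    product link and merge the full product details back into the entry."""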
    logger_func = logger_func or print
    sem = asyncio.Semaphore(5)
    entries_to_scrape = [
        entry
        for section in categories
        for sub in section.get("subcategories", [])
        for entry in sub.get("index_entries", [])
    ]

    async def scrape_entry(entry):
        async with sem:
            page = await ctx.new_page()
            try:
                print(f"[->] Visiting product tile index page: {entry.get('href')}")
                await page.goto(
                    entry["href"], timeout=60000, wait_until="domcontentloaded"
                )
                # Operation 3: scrape all product tiles in this entry
                tile_data = await scrape_product_overview_tiles(page)
                # Operation 4: for each product tile link, visit and extract full product data
                full_product_details = []
                for tile in tile_data:
                    product_url = tile.get("product_link")
                    if not product_url:
                        continue
                    try:
                        print(f"[->->] Visiting product link: {product_url}")
                        await page.goto(
                            product_url, timeout=60000, wait_until="domcontentloaded"
                        )
                        # Some product links redirect, which breaks the
                        # `extract_product_data_async` logic, so the validity
                        # check below is disabled for now.
                        # if not await is_valid_product_page(
                        #     page, logger_func=logger_func
                        # ):
                        #     logger_func(
                        #         f"[WARN] Product page appears to be invalid, removed or moved permanently: {product_url}"
                        #     )
                        #     logger_func(
                        #         f"[SKIP] Soft 404 or placeholder page: {product_url}"
                        #     )
                        #     continue
                        full_data = await extract_product_data_async(page)
                        full_product_details.append({**tile, **full_data})
                    except Exception as e:
                        print(
                            f"[WARN] Failed to extract full product at {product_url}: {e}"
                        )
                        continue
                entry["products"] = full_product_details
                print(f"[✓] Completed scraping for index entry: {entry.get('title')}")
            except Exception as e:
                print(
                    f"[WARN] Could not scrape product detail for {entry.get('href')}: {e}"
                )
            finally:
                await page.close()

    await asyncio.gather(*(scrape_entry(entry) for entry in entries_to_scrape))
    print("[INFO] Completed all tile + full product detail extractions.")


async def entrypoint(page: Page, to_excel=False) -> None:
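    """Run the full pipeline: categories, subcategory indexes, product details, optional Excel export."""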
    print("[INFO] Attempting to perform scraping...")
    scraped_data: Response = {}
    # OPERATION 1
    categories = await extract_categories_from_homepage(page)
    if not categories:
        print("[WARN] No categories were extracted; aborting scrape.")
        return
    scraped_data["categories"] = categories
    print("[INFO] Completed Extract")
    # OPERATION 2
    await scrape_all_subcategory_indexes(page.context, scraped_data["categories"])
    # OPERATION 3 + 4
    await scrape_product_overview(page.context, scraped_data["categories"])
    # print(f"[INFO] {scraped_data}")
    print("[INFO] Successfully scraped website")
    if to_excel and "categories" in scraped_data:
        print("[DEBUG] Writing extracted categories to Excel file...")
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        write_category_to_excel(
            scraped_data["categories"], filename=f"scraped_expo_data_{timestamp}.xlsx"
        )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="MedicalExpo Product Scraper")
    parser.add_argument(
        "--url",
        type=str,
        default="https://www.medicalexpo.com/",
        help="Target URL to scrape from.",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        help="Run browser in headless mode.",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode.",
    )
    parser.add_argument(
        "--slow-mo",
        type=int,
        default=40,
        help="Slow motion delay in ms between browser actions (default: 40).",
    )
    parser.add_argument(
        "--wait-for-load",
        type=int,
        default=3000,
        help="Wait time in ms after initial page load (default: 3000).",
    )
    parser.add_argument(
        "--to-excel",
        action="store_true",
        help="Whether to write the result to Excel.",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="Path to directory for saving output files.",
    )
    parser.add_argument(
        "--notify",
        action="store_true",
        help="Send notification after scraping (e.g., Slack/Email/WhatsApp or text).",
    )
    args = parser.parse_args()
    asyncio.run(
        scrape_url(
            args.url,
            headless=args.headless,
            debug=args.debug,
            slow_mo=args.slow_mo,
            wait_for_load=args.wait_for_load,
            to_excel=args.to_excel,
            output_dir=args.output_dir,
            send_notification=args.notify,
        )
    )
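
# Example invocation (the script filename here is hypothetical):
#   python scrape_medicalexpo_async.py --headless --to-excel --wait-for-load 5000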