@aravindkarnam · Last active March 26, 2025
Crawl the Walmart site with Crawl4AI
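"""Crawl Walmart search results with Crawl4AI.

Phase 1 (run once per page type): getSchema() crawls a sample search page
and asks an LLM (Gemini 2.0 Flash) to generate a CSS extraction schema
from the page's HTML.

Phase 2 (every run): get_results() reuses that schema with
JsonCssExtractionStrategy, so extraction is pure CSS with no LLM calls.
"""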
from crawl4ai import (
    CrawlerRunConfig,
    BrowserConfig,
    AsyncWebCrawler,
    JsonCssExtractionStrategy,
    LLMConfig,
)
from crawl4ai.utils import preprocess_html_for_schema
import asyncio
from urllib.parse import quote_plus  # URL-encode user-entered search terms
# headless=False: run with a visible browser window (helps avoid bot
# detection on Walmart and lets you watch the crawl)
browser_config = BrowserConfig(headless=False)
run_config = CrawlerRunConfig(
    # Narrow extraction to the search-results <section> of the page
    css_selector="html > body > div > div:first-of-type > div > div:nth-of-type(2) > div:first-of-type > main > div > div:nth-of-type(2) > div > div > div:first-of-type > div:nth-of-type(2) > div > section",
    scan_full_page=True,  # scroll the whole page so lazy-loaded tiles render
    scroll_delay=1,       # pause (in seconds) between scroll steps
    magic=True,           # enable Crawl4AI's anti-bot-detection heuristics
)
async def getSchema():
    """Crawl a sample results page and have an LLM generate a CSS extraction schema."""
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            "https://www.walmart.com/search?q=eggs", config=run_config
        )
        # Condense the raw HTML before handing it to the LLM
        processed_html = preprocess_html_for_schema(result.html)
        schema = JsonCssExtractionStrategy.generate_schema(
            processed_html,
            llm_config=LLMConfig(
                provider="gemini/gemini-2.0-flash", api_token="env:GEMINI_API_KEY"
            ),
            # Example of the JSON shape we want back, to guide schema generation
            target_json_example={
                "products": [
                    {
                        "name": "Great Value Cage-Free Large White Eggs, 18 Count",
                        "price": 10.52,
                        "image": "https://i5.walmartimages.com/seo/Great-Value-Cage-Free-Large-White-Eggs-18-Count_49bf5868-ea32-4e78-86a8-dff1a9a747c0.cd43f050a9f5ef262bd6ec054a233deb.jpeg?odnHeight=580&odnWidth=580&odnBg=FFFFFF",
                        "url": "/ip/Great-Value-Cage-Free-Large-White-Eggs-18-Count/374077316?classType=REGULAR&from=/search",
                    },
                    {
                        "name": "Great Value Cage-Free Large White Eggs, 12 Count",
                        "price": 7.12,
                        "image": "https://i5.walmartimages.com/seo/Great-Value-Cage-Free-Large-White-Eggs-12-Count_22b4f531-750e-41ed-9057-079dc022c618.62b239344db5102e607637e1048a53b9.jpeg?odnHeight=580&odnWidth=580&odnBg=FFFFFF",
                        "url": "/ip/Great-Value-Cage-Free-Large-White-Eggs-12-Count/421705528?classType=REGULAR&from=/search",
                    },
                ],
            },
        )
        return schema
async def get_results(schema):
    """Extract products from a live search using the pre-generated schema (no LLM)."""
    extraction_strategy = JsonCssExtractionStrategy(schema=schema)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        product = input("Enter the name of the product you want to search: ")
        result = await crawler.arun(
            # quote_plus URL-encodes the search term (e.g. spaces become "+")
            f"https://www.walmart.com/search?q={quote_plus(product)}",
            config=run_config.clone(extraction_strategy=extraction_strategy),
        )
        print(result.extracted_content)
async def main():
    # Generate the schema once per page type (results page, product page, etc.),
    # then save and reuse it, e.g. via the persistence sketch below.
    # schema = await getSchema()
    # print(schema)
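    # A minimal persistence sketch (not part of the original gist; the file
    # name "walmart_schema.json" is an assumption): cache the LLM-generated
    # schema on disk so later runs skip the LLM call entirely.
    # import json, os
    # if os.path.exists("walmart_schema.json"):
    #     with open("walmart_schema.json") as f:
    #         schema = json.load(f)
    # else:
    #     schema = await getSchema()
    #     with open("walmart_schema.json", "w") as f:
    #         json.dump(schema, f, indent=2)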
    # Schema previously generated by getSchema() and hardcoded here for reuse
    schema = {
        "name": "Walmart Product List",
        "baseSelector": "div[class^='mb0 ph0-xl pt0-xl bb b--near-white w-25 pb3-m ph1']",
        "fields": [
            {
                "name": "products",
                "type": "list",
                "selector": "div[class^='h-100 pr4-xl pt3']",
                "fields": [
                    {
                        "name": "name",
                        "selector": "span[data-automation-id='product-title']",
                        "type": "text",
                    },
                    {
                        "name": "price",
                        "selector": "div[data-automation-id='product-price'] .f2",
                        "type": "text",
                    },
                    {
                        "name": "image",
                        "selector": "img[data-testid='productTileImage']",
                        "type": "attribute",
                        "attribute": "src",
                    },
                    {
                        "name": "url",
                        "selector": "a",
                        "type": "attribute",
                        "attribute": "href",
                    },
                ],
            }
        ],
    }
    await get_results(schema)

asyncio.run(main())