Last active
March 26, 2025 07:21
-
-
Save aravindkarnam/da285d4e2af3a8d988640ad6aa2ece81 to your computer and use it in GitHub Desktop.
Crawl the Walmart site with Crawl4AI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    JsonCssExtractionStrategy,
    LLMConfig,
)
from crawl4ai.utils import preprocess_html_for_schema

# Run with a visible browser window (headless=False) — useful for watching
# the crawl and getting past any interactive bot checks.
browser_config = BrowserConfig(headless=False)

# Shared crawl configuration: narrow extraction to the search-results
# section and scroll the whole page so lazily-loaded tiles are rendered.
run_config = CrawlerRunConfig(
    css_selector="html > body > div > div:first-of-type > div > div:nth-of-type(2) > div:first-of-type > main > div > div:nth-of-type(2) > div > div > div:first-of-type > div:nth-of-type(2) > div > section",
    scan_full_page=True,
    scroll_delay=1,  # seconds between scroll steps
    magic=True,      # enable crawl4ai's anti-bot-detection heuristics
)
async def getSchema():
    """Crawl one Walmart search-results page and LLM-generate a CSS schema.

    Fetches the live results page for "eggs", preprocesses the HTML into a
    compact form, and asks Gemini (via ``JsonCssExtractionStrategy``) to
    infer CSS selectors that match the shape of ``target_json_example``.

    Returns:
        A schema dict usable with ``JsonCssExtractionStrategy(schema=...)``.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            "https://www.walmart.com/search?q=eggs", config=run_config
        )
        # Reduce the raw page to a cleaner, smaller HTML sample before
        # handing it to the LLM.
        processed_html = preprocess_html_for_schema(result.html)
        schema = JsonCssExtractionStrategy.generate_schema(
            processed_html,
            llm_config=LLMConfig(
                # API key is read from the GEMINI_API_KEY environment variable.
                provider="gemini/gemini-2.0-flash", api_token="env:GEMINI_API_KEY"
            ),
            # Few-shot example of the desired output shape; guides the LLM's
            # selector inference.
            target_json_example={
                "products": [
                    {
                        "name": "Great Value Cage-Free Large White Eggs, 18 Count",
                        "price": 10.52,
                        "image": "https://i5.walmartimages.com/seo/Great-Value-Cage-Free-Large-White-Eggs-18-Count_49bf5868-ea32-4e78-86a8-dff1a9a747c0.cd43f050a9f5ef262bd6ec054a233deb.jpeg?odnHeight=580&odnWidth=580&odnBg=FFFFFF",
                        "url": "/ip/Great-Value-Cage-Free-Large-White-Eggs-18-Count/374077316?classType=REGULAR&from=/search",
                    },
                    {
                        "name": "Great Value Cage-Free Large White Eggs, 12 Count",
                        "price": 7.12,
                        # Fixed malformed scheme: was "hhttps://", which made
                        # the example URL invalid.
                        "image": "https://i5.walmartimages.com/seo/Great-Value-Cage-Free-Large-White-Eggs-12-Count_22b4f531-750e-41ed-9057-079dc022c618.62b239344db5102e607637e1048a53b9.jpeg?odnHeight=580&odnWidth=580&odnBg=FFFFFF",
                        "url": "/ip/Great-Value-Cage-Free-Large-White-Eggs-12-Count/421705528?classType=REGULAR&from=/search",
                    },
                ],
            },
        )
        return schema
async def get_results(schema):
    """Ask the user for a search term, crawl Walmart, and print the products.

    Args:
        schema: A JsonCssExtractionStrategy-compatible schema dict describing
            how to pull product fields out of the results page.
    """
    strategy = JsonCssExtractionStrategy(schema=schema)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        query = input("Enter the name of product you want to search:")
        search_result = await crawler.arun(
            f"https://www.walmart.com/search?q={query}",
            # Reuse the shared run config, adding the extraction strategy.
            config=run_config.clone(extraction_strategy=strategy),
        )
        print(search_result.extracted_content)
async def main():
    """Entry point: extract Walmart search results with a saved CSS schema."""
    # Generate schema once per page type (results page, product page, etc.)
    # then save and reuse it:
    # schema = await getSchema()
    # print(schema)
    schema = {
        "name": "Walmart Product List",
        "baseSelector": "div[class^='mb0 ph0-xl pt0-xl bb b--near-white w-25 pb3-m ph1']",
        "fields": [
            {
                "name": "products",
                "type": "list",
                "selector": "div[class^='h-100 pr4-xl pt3']",
                "fields": [
                    {
                        "name": "name",
                        "selector": "span[data-automation-id='product-title']",
                        "type": "text",
                    },
                    {
                        "name": "price",
                        "selector": "div[data-automation-id='product-price'] .f2",
                        "type": "text",
                    },
                    {
                        "name": "image",
                        "selector": "img[data-testid='productTileImage']",
                        "type": "attribute",
                        "attribute": "src",
                    },
                    {
                        "name": "url",
                        "selector": "a",
                        "type": "attribute",
                        "attribute": "href",
                    },
                ],
            }
        ],
    }
    await get_results(schema)


# Guard the entry point so importing this module does not launch a crawl.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment