Last active
March 26, 2025 07:21
-
-
Save aravindkarnam/da285d4e2af3a8d988640ad6aa2ece81 to your computer and use it in GitHub Desktop.
Crawl the Walmart site with Crawl4AI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    JsonCssExtractionStrategy,
    LLMConfig,
)
from crawl4ai.utils import preprocess_html_for_schema

# Run with a visible browser window (headless=False) — useful for watching
# the crawl and getting past any interactive bot checks.
browser_config = BrowserConfig(headless=False)

# Shared crawl configuration: narrow extraction to the search-results
# section and scroll the whole page so lazily-loaded tiles are rendered.
run_config = CrawlerRunConfig(
    css_selector="html > body > div > div:first-of-type > div > div:nth-of-type(2) > div:first-of-type > main > div > div:nth-of-type(2) > div > div > div:first-of-type > div:nth-of-type(2) > div > section",
    scan_full_page=True,
    scroll_delay=1,  # seconds between scroll steps
    magic=True,      # enable crawl4ai's anti-bot-detection heuristics
)
async def getSchema():
    """Crawl one Walmart search-results page and LLM-generate a CSS schema.

    Fetches the live results page for "eggs", preprocesses the HTML into a
    compact form, and asks Gemini (via ``JsonCssExtractionStrategy``) to
    infer CSS selectors that match the shape of ``target_json_example``.

    Returns:
        A schema dict usable with ``JsonCssExtractionStrategy(schema=...)``.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            "https://www.walmart.com/search?q=eggs", config=run_config
        )
        # Reduce the raw page to a cleaner, smaller HTML sample before
        # handing it to the LLM.
        processed_html = preprocess_html_for_schema(result.html)
        schema = JsonCssExtractionStrategy.generate_schema(
            processed_html,
            llm_config=LLMConfig(
                # API key is read from the GEMINI_API_KEY environment variable.
                provider="gemini/gemini-2.0-flash", api_token="env:GEMINI_API_KEY"
            ),
            # Few-shot example of the desired output shape; guides the LLM's
            # selector inference.
            target_json_example={
                "products": [
                    {
                        "name": "Great Value Cage-Free Large White Eggs, 18 Count",
                        "price": 10.52,
                        "image": "https://i5.walmartimages.com/seo/Great-Value-Cage-Free-Large-White-Eggs-18-Count_49bf5868-ea32-4e78-86a8-dff1a9a747c0.cd43f050a9f5ef262bd6ec054a233deb.jpeg?odnHeight=580&odnWidth=580&odnBg=FFFFFF",
                        "url": "/ip/Great-Value-Cage-Free-Large-White-Eggs-18-Count/374077316?classType=REGULAR&from=/search",
                    },
                    {
                        "name": "Great Value Cage-Free Large White Eggs, 12 Count",
                        "price": 7.12,
                        # Fixed malformed scheme: was "hhttps://", which made
                        # the example URL invalid.
                        "image": "https://i5.walmartimages.com/seo/Great-Value-Cage-Free-Large-White-Eggs-12-Count_22b4f531-750e-41ed-9057-079dc022c618.62b239344db5102e607637e1048a53b9.jpeg?odnHeight=580&odnWidth=580&odnBg=FFFFFF",
                        "url": "/ip/Great-Value-Cage-Free-Large-White-Eggs-12-Count/421705528?classType=REGULAR&from=/search",
                    },
                ],
            },
        )
        return schema
async def get_results(schema):
    """Ask the user for a search term, crawl Walmart, and print the products.

    Args:
        schema: A JsonCssExtractionStrategy-compatible schema dict describing
            how to pull product fields out of the results page.
    """
    strategy = JsonCssExtractionStrategy(schema=schema)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        query = input("Enter the name of product you want to search:")
        search_result = await crawler.arun(
            f"https://www.walmart.com/search?q={query}",
            # Reuse the shared run config, adding the extraction strategy.
            config=run_config.clone(extraction_strategy=strategy),
        )
        print(search_result.extracted_content)
async def main():
    """Entry point: extract Walmart search results with a saved CSS schema."""
    # Generate schema once per page type (results page, product page, etc.)
    # then save and reuse it:
    # schema = await getSchema()
    # print(schema)
    schema = {
        "name": "Walmart Product List",
        "baseSelector": "div[class^='mb0 ph0-xl pt0-xl bb b--near-white w-25 pb3-m ph1']",
        "fields": [
            {
                "name": "products",
                "type": "list",
                "selector": "div[class^='h-100 pr4-xl pt3']",
                "fields": [
                    {
                        "name": "name",
                        "selector": "span[data-automation-id='product-title']",
                        "type": "text",
                    },
                    {
                        "name": "price",
                        "selector": "div[data-automation-id='product-price'] .f2",
                        "type": "text",
                    },
                    {
                        "name": "image",
                        "selector": "img[data-testid='productTileImage']",
                        "type": "attribute",
                        "attribute": "src",
                    },
                    {
                        "name": "url",
                        "selector": "a",
                        "type": "attribute",
                        "attribute": "href",
                    },
                ],
            }
        ],
    }
    await get_results(schema)


# Guard the entry point so importing this module does not launch a crawl.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment