@aravindkarnam
Created February 12, 2025 07:58
Scrape Google Play Store reviews with Crawl4AI
import asyncio
import json
import os

from playwright.async_api import Page, BrowserContext

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BrowserConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
    # Called right before the final HTML is retrieved.
    print("[HOOK] before_retrieve_html - We can do final actions")

    # Keep scrolling the reviews container until ~1000 reviews are loaded, or
    # until no new reviews appear for 5 consecutive scroll attempts.
    await page.evaluate("""
        (async () => {
            const element = document.querySelector('.fysCi.Vk3ZVd');
            let scrollAttempts = 0;
            const maxScrollAttempts = 5;
            while (document.querySelectorAll('.RHo1pe').length < 1000 && scrollAttempts < maxScrollAttempts) {
                const previousComments = document.querySelectorAll('.RHo1pe').length;
                element.scrollBy(0, 1500);
                await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds
                const currentComments = document.querySelectorAll('.RHo1pe').length;
                if (currentComments <= previousComments) {
                    scrollAttempts++;
                } else {
                    scrollAttempts = 0; // Reset the counter if new comments were loaded
                }
            }
        })();
    """)
    return page

async def extract():
    # Extraction schema: each .RHo1pe element is one review card
    schema = {
        "name": "Comment",
        "baseSelector": ".RHo1pe",
        "fields": [
            {"name": "TIME", "selector": "header.c1bOId > div.iXRFPc > span.bp9Aid", "type": "text"},
            {"name": "SUMMARY", "selector": "div.h3YV2d", "type": "text"},
        ],
    }

    # JavaScript commands: scroll to the bottom of the page, wait for content
    # to load, then click the button (located by its class names via XPath)
    # that opens the full reviews dialog
    js_commands = [
        "window.scrollTo(0, document.body.scrollHeight);",
        "await new Promise(resolve => setTimeout(resolve, 2000));",
        """
        document.evaluate(
            "//button[@class='VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-dgl2Hf ksBjEc lKxP2d LQeN7 aLey0c']//span[@class='VfPpkd-vQzf8d']",
            document,
            null,
            XPathResult.FIRST_ORDERED_NODE_TYPE,
            null
        ).singleNodeValue?.parentElement?.click();
        """,
    ]
    # Browser configuration with custom headers and settings
    browser_config = BrowserConfig(
        headless=False,
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        },
    )

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            # Register the scrolling hook so it runs right before the HTML is captured
            crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)

            config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=extraction_strategy,
                scan_full_page=True,
                scroll_delay=3.0,
                js_code=js_commands,
                delay_before_return_html=3.0,
                page_timeout=1200000,  # Generous timeout: 1,200,000 ms (20 minutes)
                wait_until="networkidle",  # Wait until the network is idle
            )
            result = await crawler.arun(
                url="https://play.google.com/store/apps/details?id=us.nobarriers.elsa&hl=vi",
                config=config,
            )

            if not result.success:
                print(f"Crawl failed: {result.error_message}")
                return

            if not result.extracted_content:
                print("No content was extracted")
                return

            # Save the extracted reviews next to this script
            script_dir = os.path.dirname(os.path.abspath(__file__))
            output_file = os.path.join(script_dir, "auto_load_comments.json")
            reviews = json.loads(result.extracted_content)
            with open(output_file, "w", encoding="utf-8") as json_file:
                json.dump(reviews, json_file, ensure_ascii=False, indent=4)
            print(f"Data has been saved to {output_file}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print(f"Error type: {type(e)}")


if __name__ == "__main__":
    asyncio.run(extract())
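
Once the crawl finishes, the extracted reviews are saved to auto_load_comments.json as a list of objects with the TIME and SUMMARY fields defined in the schema above. A minimal sketch for loading and spot-checking that output (assuming the file was written next to the script, as extract() does):

import json
import os

script_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(script_dir, "auto_load_comments.json"), encoding="utf-8") as f:
    reviews = json.load(f)

print(f"Scraped {len(reviews)} reviews")
for review in reviews[:3]:
    # Field names match the extraction schema: TIME and SUMMARY
    print(review.get("TIME"), "-", review.get("SUMMARY"))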