@aravindkarnam
Created February 12, 2025 07:58
Scrape Google Play Store reviews with Crawl4AI
import asyncio
import json
import os

from playwright.async_api import Page, BrowserContext

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BrowserConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
    # Called right before the final HTML is retrieved.
    print("[HOOK] before_retrieve_html - We can do final actions")

    # Keep scrolling the reviews container until ~1000 reviews are loaded, or
    # until no new reviews appear for 5 consecutive scroll attempts.
    await page.evaluate("""
        (async () => {
            const element = document.querySelector('.fysCi.Vk3ZVd');
            let scrollAttempts = 0;
            const maxScrollAttempts = 5;
            while (document.querySelectorAll('.RHo1pe').length < 1000 && scrollAttempts < maxScrollAttempts) {
                const previousComments = document.querySelectorAll('.RHo1pe').length;
                element.scrollBy(0, 1500);
                await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds
                const currentComments = document.querySelectorAll('.RHo1pe').length;
                if (currentComments <= previousComments) {
                    scrollAttempts++;
                } else {
                    scrollAttempts = 0; // Reset the counter if new comments were loaded
                }
            }
        })();
    """)
    return page

async def extract():
    # Extraction schema: each .RHo1pe element is one review card
    schema = {
        "name": "Comment",
        "baseSelector": ".RHo1pe",
        "fields": [
            {"name": "TIME", "selector": "header.c1bOId > div.iXRFPc > span.bp9Aid", "type": "text"},
            {"name": "SUMMARY", "selector": "div.h3YV2d", "type": "text"},
        ],
    }

    # JavaScript commands: scroll to the bottom of the page, wait for content
    # to load, then click the button (located by its class names via XPath)
    # that opens the full reviews dialog
    js_commands = [
        "window.scrollTo(0, document.body.scrollHeight);",
        "await new Promise(resolve => setTimeout(resolve, 2000));",
        """
        document.evaluate(
            "//button[@class='VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-dgl2Hf ksBjEc lKxP2d LQeN7 aLey0c']//span[@class='VfPpkd-vQzf8d']",
            document,
            null,
            XPathResult.FIRST_ORDERED_NODE_TYPE,
            null
        ).singleNodeValue?.parentElement?.click();
        """,
    ]
    # Browser configuration with custom headers and settings
    browser_config = BrowserConfig(
        headless=False,
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        },
    )

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            # Register the scrolling hook so it runs right before the HTML is captured
            crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)

            config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=extraction_strategy,
                scan_full_page=True,
                scroll_delay=3.0,
                js_code=js_commands,
                delay_before_return_html=3.0,
                page_timeout=1200000,  # Generous timeout: 1,200,000 ms (20 minutes)
                wait_until="networkidle",  # Wait until the network is idle
            )
            result = await crawler.arun(
                url="https://play.google.com/store/apps/details?id=us.nobarriers.elsa&hl=vi",
                config=config,
            )

            if not result.success:
                print(f"Crawl failed: {result.error_message}")
                return

            if not result.extracted_content:
                print("No content was extracted")
                return

            # Save the extracted reviews next to this script
            script_dir = os.path.dirname(os.path.abspath(__file__))
            output_file = os.path.join(script_dir, "auto_load_comments.json")
            reviews = json.loads(result.extracted_content)
            with open(output_file, "w", encoding="utf-8") as json_file:
                json.dump(reviews, json_file, ensure_ascii=False, indent=4)
            print(f"Data has been saved to {output_file}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print(f"Error type: {type(e)}")


if __name__ == "__main__":
    asyncio.run(extract())
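
Once the crawl finishes, the extracted reviews are saved to auto_load_comments.json as a list of objects with the TIME and SUMMARY fields defined in the schema above. A minimal sketch for loading and spot-checking that output (assuming the file was written next to the script, as extract() does):

import json
import os

script_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(script_dir, "auto_load_comments.json"), encoding="utf-8") as f:
    reviews = json.load(f)

print(f"Scraped {len(reviews)} reviews")
for review in reviews[:3]:
    # Field names match the extraction schema: TIME and SUMMARY
    print(review.get("TIME"), "-", review.get("SUMMARY"))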