-
-
Save aravindkarnam/536461416de4f66ebd8e922ad6d2ff8e to your computer and use it in GitHub Desktop.
Scrape google playstore reviews with Crawl4AI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy | |
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BrowserConfig | |
import json | |
import os | |
from playwright.async_api import Page, BrowserContext | |
async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
    """Crawl4AI hook: scroll the Play Store review dialog to load more reviews.

    Called by the crawler strategy just before the final HTML snapshot is
    taken.  Repeatedly scrolls the scrollable review container
    ('.fysCi.Vk3ZVd') and waits for new review cards ('.RHo1pe') to appear,
    stopping once ~1000 reviews are loaded or after 5 consecutive scrolls
    that produce no new cards.

    Args:
        page: Playwright page being crawled.
        context: Playwright browser context (unused, part of the hook signature).
        **kwargs: Extra hook arguments supplied by Crawl4AI (unused).

    Returns:
        The same ``page`` object, as the hook contract expects.
    """
    print("[HOOK] before_retrieve_html - We can do final actions")
    # page.evaluate awaits the promise returned by the async IIFE, so the
    # whole scroll loop finishes before the HTML is captured.
    await page.evaluate("""
        (async () => {
            const element = document.querySelector('.fysCi.Vk3ZVd');
            // Fix: bail out instead of throwing when the scrollable review
            // container is absent (e.g. the reviews dialog never opened).
            if (!element) return;
            let scrollAttempts = 0;
            const maxScrollAttempts = 5;
            while (document.querySelectorAll('.RHo1pe').length < 1000 && scrollAttempts < maxScrollAttempts) {
                const previousComments = document.querySelectorAll('.RHo1pe').length;
                element.scrollBy(0, 1500);
                await new Promise(resolve => setTimeout(resolve, 2000)); // wait for lazy-loaded reviews
                const currentComments = document.querySelectorAll('.RHo1pe').length;
                if (currentComments <= previousComments) {
                    scrollAttempts++; // no progress this pass
                } else {
                    scrollAttempts = 0; // new reviews arrived: reset the stall counter
                }
            }
        })();
    """)
    return page
async def extract():
    """Scrape Google Play Store reviews and save them as JSON.

    Flow:
      1. Open the app's Play Store page with a visible (non-headless) browser.
      2. Run JS to scroll to the bottom and click the "See all reviews" button.
      3. Let the ``before_retrieve_html`` hook scroll the review dialog so
         more reviews lazy-load.
      4. Extract each review's date and text with a CSS schema and write the
         result to ``auto_load_comments.json`` next to this script.

    Errors are reported to stdout; nothing is raised to the caller.
    """
    # CSS extraction schema: one record per review card (.RHo1pe).
    schema = {
        "name": "Comment",
        "baseSelector": ".RHo1pe",
        "fields": [
            # Review date shown in the card header.
            {"name": "TIME", "selector": "header.c1bOId > div.iXRFPc > span.bp9Aid", "type": "text"},
            # Review body text.
            {"name": "SUMMARY", "selector": "div.h3YV2d", "type": "text"},
        ]
    }

    # JavaScript run after page load: scroll down, wait for lazy content,
    # then click the "See all reviews" button.  The button is located by
    # XPath on its obfuscated class names — brittle, but there is no stable
    # id on the Play Store page.
    js_commands = [
        "window.scrollTo(0, document.body.scrollHeight);",
        "await new Promise(resolve => setTimeout(resolve, 2000));",
        """
        document.evaluate(
            "//button[@class='VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-dgl2Hf ksBjEc lKxP2d LQeN7 aLey0c']//span[@class='VfPpkd-vQzf8d']",
            document,
            null,
            XPathResult.FIRST_ORDERED_NODE_TYPE,
            null
        ).singleNodeValue?.parentElement?.click();
        """,
    ]

    # Browser configuration with custom headers.  headless=False — presumably
    # to look less bot-like to Google; confirm before flipping it.
    browser_config = BrowserConfig(
        headless=False,
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
    )

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            # Register the scrolling hook defined above.
            crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)
            config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,  # always fetch fresh reviews
                extraction_strategy=extraction_strategy,
                scan_full_page=True,
                scroll_delay=3.0,
                js_code=js_commands,
                delay_before_return_html=3.0,
                # Fix: the old comment claimed "120 seconds", but 1,200,000 ms
                # is 20 minutes — the long scroll session needs it.
                page_timeout=1200000,
                wait_until="networkidle"  # wait until network is idle
            )
            result = await crawler.arun(
                url="https://play.google.com/store/apps/details?id=us.nobarriers.elsa&hl=vi",
                config=config
            )

            if not result.success:
                print(f"Crawl failed: {result.error_message}")
                return
            if not result.extracted_content:
                print("No content was extracted")
                return

            # Save the extracted content next to this script, keeping
            # non-ASCII review text readable (ensure_ascii=False).
            script_dir = os.path.dirname(os.path.abspath(__file__))
            output_file = os.path.join(script_dir, "auto_load_comments.json")
            articles = json.loads(result.extracted_content)
            with open(output_file, "w", encoding="utf-8") as json_file:
                json.dump(articles, json_file, ensure_ascii=False, indent=4)
            print(f"Data has been saved to {output_file}")
    except Exception as e:
        # Top-level boundary for a standalone script: report and swallow.
        print(f"An error occurred: {str(e)}")
        print(f"Error type: {type(e)}")
# Script entry point: drive the async scraper to completion.
if __name__ == "__main__":
    asyncio.run(extract())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment