Skip to content

Instantly share code, notes, and snippets.

@aravindkarnam
Last active March 7, 2025 10:05
Show Gist options
  • Save aravindkarnam/e4c8b44b93ce4adaf36f1f0a4704fe82 to your computer and use it in GitHub Desktop.
Save aravindkarnam/e4c8b44b93ce4adaf36f1f0a4704fe82 to your computer and use it in GitHub Desktop.
Get the cryptocurrency directory from CoinMarketCap with crawl4ai
import asyncio
import json
from crawl4ai import (
AsyncWebCrawler,
CacheMode,
CrawlerRunConfig,
LLMConfig,
JsonCssExtractionStrategy,
)
import time
# Base run configuration shared by the crawls in this script: it is passed
# as-is when generating the schema and cloned with extra options when
# fetching the paginated data.
crawler_config = CrawlerRunConfig(
    # Only the market table is of interest on the page.
    css_selector="table.cmc-table",
    screenshot=True,
    # The author noted CacheMode.BYPASS / CacheMode.DISABLED as the options here.
    cache_mode=CacheMode.BYPASS,
)
# Example of the rows we want back, handed to the LLM as a hint for which
# fields the generated schema should capture. The trailing "..." signals that
# the real list continues beyond the two sample rows.
_TARGET_JSON_EXAMPLE = """[{
    "name": "Bitcoin",
    "symbol": "BTC",
    "price": "$50,000",
    "1h_change": "+1.5%",
    "24h_change": "+2.5%",
    "7d_change": "+5.5%",
    "market_cap": "$1,000,000",
    "volume_24h": "$100,000",
    "circulating_supply": "18,000,000",
    "7d_chart": "https://www.coindesk.com/wp-content/uploads/2013/12/shutterstock_773594696.jpg"
},{
    "name": "Ethereum",
    "symbol": "ETH",
    "price": "$3,000",
    "1h_change": "+1.5%",
    "24h_change": "+2.5%",
    "7d_change": "+5.5%",
    "market_cap": "$300,000",
    "volume_24h": "$30,000",
    "circulating_supply": "100,000,000",
    "7d_chart": "https://www.coindesk.com/wp-content/uploads/2013/12/shutterstock_773594696.jpg"
},
...]"""


async def generate_schema():
    """Ask an LLM to derive a CSS extraction schema for the CoinMarketCap table.

    Crawls ``https://coinmarketcap.com/`` with the shared ``crawler_config``,
    prints the cleaned HTML for inspection, and then passes the page HTML
    together with an example of the desired JSON rows to
    ``JsonCssExtractionStrategy.generate_schema``. The generated schema is
    printed so it can be copied into the script and reused without calling
    the LLM again.

    Returns:
        The schema produced by ``JsonCssExtractionStrategy.generate_schema``.

    Raises:
        RuntimeError: If the crawl of the front page did not succeed, so that
            no (paid) LLM call is made on missing or partial HTML.
    """
    # The API token is given as "env:GEMINI_API_KEY"; presumably crawl4ai
    # resolves it from the environment variable of that name.
    llm_config = LLMConfig(
        provider="gemini/gemini-1.5-flash", api_token="env:GEMINI_API_KEY"
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://coinmarketcap.com/",
            config=crawler_config,
        )
        if not result.success:
            raise RuntimeError(
                f"Failed to crawl https://coinmarketcap.com/: {result.error_message}"
            )
        print(result.cleaned_html)
        schema = JsonCssExtractionStrategy.generate_schema(
            llm_config=llm_config,
            html=result.html,
            schema_type="CSS",
            target_json_example=_TARGET_JSON_EXAMPLE,
        )
        print(schema)
        return schema
async def get_data(schema):
    """Scrape the paginated CoinMarketCap listing using a CSS extraction schema.

    Fetches pages 1 through 99 of ``https://coinmarketcap.com/?page=N`` with a
    single crawler instance, extracts rows from each page with
    ``JsonCssExtractionStrategy(schema=schema)``, and concatenates them.
    The elapsed time and number of extracted rows are printed at the end.

    Pages whose crawl fails or yields no extracted content are reported on
    stdout and skipped, so that one bad page does not discard the rows
    already collected.

    Args:
        schema: Extraction schema accepted by ``JsonCssExtractionStrategy``
            (for example the one returned by ``generate_schema``).

    Returns:
        A list with the extracted records of all successfully crawled pages.
    """
    # The per-page configuration does not depend on the page number, so it is
    # built once. ``scan_full_page`` scrolls through the page and
    # ``scroll_delay`` is the pause between scroll steps.
    page_config = crawler_config.clone(
        extraction_strategy=JsonCssExtractionStrategy(schema=schema),
        scan_full_page=True,
        scroll_delay=1,
    )
    async with AsyncWebCrawler() as crawler:
        currencies = []
        # perf_counter is monotonic, which makes it the right clock for
        # measuring elapsed time.
        start_time = time.perf_counter()
        for page in range(1, 100):
            result = await crawler.arun(
                url=f"https://coinmarketcap.com/?page={page}",
                config=page_config,
            )
            # extracted_content is None when nothing could be extracted;
            # json.loads(None) would raise TypeError and lose all prior pages.
            if not result.success or not result.extracted_content:
                reason = result.error_message or "no content extracted"
                print(f"Skipping page {page}: {reason}")
                continue
            currencies.extend(json.loads(result.extracted_content))
        end_time = time.perf_counter()
        print(f"Time taken to extract {len(currencies)} currencies: {end_time - start_time}")
        return currencies
async def main():
    """Fetch the currency listing with a pre-generated schema and report the count."""
    # The schema below was produced by an earlier run of generate_schema() and
    # pasted in, so the LLM does not have to be called on every run. To
    # regenerate it, temporarily replace the literal with
    # ``schema = await generate_schema()`` and copy the printed result back here.
    schema = {
        "name": "CoinMarketCap Cryptocurrency List",
        "baseSelector": "table.cmc-table tbody tr",
        "fields": [
            {"name": "rank", "selector": "td:nth-child(2) p", "type": "text"},
            {
                "name": "name",
                "selector": "td:nth-child(3) a div p.coin-item-name",
                "type": "text",
            },
            {
                "name": "symbol",
                "selector": "td:nth-child(3) a div p.coin-item-symbol",
                "type": "text",
            },
            {"name": "price", "selector": "td:nth-child(4) span", "type": "text"},
            {"name": "1h_change", "selector": "td:nth-child(5) span", "type": "text"},
            {"name": "24h_change", "selector": "td:nth-child(6) span", "type": "text"},
            {"name": "7d_change", "selector": "td:nth-child(7) span", "type": "text"},
            {
                "name": "market_cap",
                "selector": "td:nth-child(8) p span.sc-11478e5d-0",
                "type": "text",
            },
            {"name": "volume_24h", "selector": "td:nth-child(9) a p", "type": "text"},
            {
                "name": "circulating_supply",
                "selector": "td:nth-child(10) div span",
                "type": "text",
            },
            {
                "name": "7d_chart",
                "selector": "td:nth-child(11) img",
                "type": "attribute",
                "attribute": "src",
            },
        ],
    }
    records = await get_data(schema)
    print(f"retreived {len(records)} records")


# Script entry point: run the async main() on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment