-
-
Save aravindkarnam/e4c8b44b93ce4adaf36f1f0a4704fe82 to your computer and use it in GitHub Desktop.
Get the cryptocurrency directory from CoinMarketCap with crawl4ai
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import json | |
from crawl4ai import ( | |
AsyncWebCrawler, | |
CacheMode, | |
CrawlerRunConfig, | |
LLMConfig, | |
JsonCssExtractionStrategy, | |
) | |
import time | |
# Base run configuration shared by every crawl in this script; get_data()
# derives its per-page configuration from it via clone().
crawler_config = CrawlerRunConfig(
    # CSS selector handed to the crawler (the element with class "cmc-table").
    css_selector="table.cmc-table",
    screenshot=True,
    # CacheMode.DISABLED is the alternative the author noted for this setting.
    cache_mode=CacheMode.BYPASS,
)
async def generate_schema():
    """Generate a CSS extraction schema for the CoinMarketCap listing table.

    Crawls https://coinmarketcap.com/ with the module-level ``crawler_config``,
    prints the cleaned HTML, then asks the configured LLM (via
    ``JsonCssExtractionStrategy.generate_schema``) to derive a schema that
    yields records shaped like the example below. The API token is referenced
    as ``env:GEMINI_API_KEY``.

    Returns:
        The schema produced by ``JsonCssExtractionStrategy.generate_schema``.

    Raises:
        RuntimeError: If the crawl of the front page does not succeed.
    """
    llm_config = LLMConfig(
        provider="gemini/gemini-1.5-flash", api_token="env:GEMINI_API_KEY"
    )

    # Build the example from Python objects so the text sent to the LLM is
    # always well-formed JSON. The hand-written literal it replaces had a raw
    # line break inside the chart URL and never closed its second object.
    chart_url = (
        "https://www.coindesk.com/wp-content/uploads/2013/12/"
        "shutterstock_773594696.jpg"
    )
    target_json_example = json.dumps(
        [
            {
                "name": "Bitcoin",
                "symbol": "BTC",
                "price": "$50,000",
                "1h_change": "+1.5%",
                "24h_change": "+2.5%",
                "7d_change": "+5.5%",
                "market_cap": "$1,000,000",
                "volume_24h": "$100,000",
                "circulating_supply": "18,000,000",
                "7d_chart": chart_url,
            },
            {
                "name": "Ethereum",
                "symbol": "ETH",
                "price": "$3,000",
                "1h_change": "+1.5%",
                "24h_change": "+2.5%",
                "7d_change": "+5.5%",
                "market_cap": "$300,000",
                "volume_24h": "$30,000",
                "circulating_supply": "100,000,000",
                "7d_chart": chart_url,
            },
        ],
        indent=2,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://coinmarketcap.com/",
            config=crawler_config,
        )

    # Without page HTML there is nothing for the LLM to derive selectors from.
    if not result.success:
        raise RuntimeError(
            f"Could not crawl https://coinmarketcap.com/: {result.error_message}"
        )

    print(result.cleaned_html)
    schema = JsonCssExtractionStrategy.generate_schema(
        llm_config=llm_config,
        html=result.html,
        schema_type="CSS",
        target_json_example=target_json_example,
    )
    print(schema)
    return schema
async def get_data(schema):
    """Crawl the paginated CoinMarketCap listing and extract its table rows.

    Requests ``https://coinmarketcap.com/?page=N`` for N = 1..99 one page at a
    time, applies ``JsonCssExtractionStrategy`` with the given schema, and
    concatenates the records from every page. Prints the elapsed time.

    Args:
        schema: Extraction schema passed to ``JsonCssExtractionStrategy``.

    Returns:
        A list of the extracted records from all successfully crawled pages,
        in page order. Pages that fail or yield no content are skipped.
    """
    # The per-page configuration is identical for every request, so derive it
    # once instead of cloning the base config on each iteration.
    page_config = crawler_config.clone(
        extraction_strategy=JsonCssExtractionStrategy(schema=schema),
        scan_full_page=True,
        # Previously misspelled "sroll_delay", which is not a CrawlerRunConfig
        # option, so the intended delay between scroll steps was never applied.
        scroll_delay=1,
    )

    currencies = []
    async with AsyncWebCrawler() as crawler:
        start_time = time.time()
        for i in range(1, 100):
            result = await crawler.arun(
                url=f"https://coinmarketcap.com/?page={i}",
                config=page_config,
            )
            # json.loads(None) raises TypeError, so one failed page would
            # otherwise discard everything collected so far.
            if not result.success or not result.extracted_content:
                print(f"Skipping page {i}: {result.error_message}")
                continue
            currencies.extend(json.loads(result.extracted_content))
        end_time = time.time()
        print(f"Time taken to extract {len(currencies)} currencies: {end_time - start_time}")
    return currencies
async def main():
    """Extract the listing with a pre-generated schema and print the record count."""
    # First run only: uncomment the line below to generate the schema, paste
    # the printed schema into this function, then comment the line out again.
    # The schema does not have to be regenerated on every run.
    # schema = await generate_schema()

    # (field name, CSS selector relative to the row) for every text column.
    text_columns = [
        ("rank", "td:nth-child(2) p"),
        ("name", "td:nth-child(3) a div p.coin-item-name"),
        ("symbol", "td:nth-child(3) a div p.coin-item-symbol"),
        ("price", "td:nth-child(4) span"),
        ("1h_change", "td:nth-child(5) span"),
        ("24h_change", "td:nth-child(6) span"),
        ("7d_change", "td:nth-child(7) span"),
        ("market_cap", "td:nth-child(8) p span.sc-11478e5d-0"),
        ("volume_24h", "td:nth-child(9) a p"),
        ("circulating_supply", "td:nth-child(10) div span"),
    ]
    field_specs = [
        {"name": label, "selector": css, "type": "text"}
        for label, css in text_columns
    ]
    # The chart column is read from the image's "src" attribute, not its text.
    field_specs.append(
        {
            "name": "7d_chart",
            "selector": "td:nth-child(11) img",
            "type": "attribute",
            "attribute": "src",
        }
    )
    schema = {
        "name": "CoinMarketCap Cryptocurrency List",
        "baseSelector": "table.cmc-table tbody tr",
        "fields": field_specs,
    }

    records = await get_data(schema)
    print(f"retreived {len(records)} records")
# Script entry point: run the async main() coroutine to completion when the
# file is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment