-
-
Save lobstrio/972a60304517e12d85682beee4d61703 to your computer and use it in GitHub Desktop.
#!/usr/bin/python3 | |
# coding: utf-8 | |
import requests | |
from bs4 import BeautifulSoup | |
from scrapy import Selector | |
import csv | |
import datetime | |
def extract(url): | |
""" | |
Export all cryptodata from cryptomarketcap.com | |
website | |
Arguments: | |
url (str): | |
url of the aimed Coinmarketcap page | |
Return: | |
.csv file | |
""" | |
# INITIALISATION | |
r = requests.session() | |
start = datetime.datetime.now() | |
# COLLECTE DU CODE SOURCE | |
for retry in range(10): # ajout d'un retrier en cas de site inaccessible | |
response = r.get(url=url) | |
print(response.headers) | |
print("-- STATUS CODE --") | |
print(response.status_code) | |
# PARSING ET CREATION DU CSV | |
if response.status_code == 200: | |
with open("/path/to/coinmarketcap/cryptocurrencies_{}.csv".format(str(datetime.date.today())), "w") as f: | |
fieldnames = ['Nom', 'Symbole', 'Cap. marché', 'Prix', 'Offre en circulation', 'Volume (24h)', '% 1h', '% 24h', '7 j'] | |
writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t') | |
writer.writeheader() | |
soup = BeautifulSoup(response.text, features='html.parser') | |
sel = Selector(text=soup.prettify()) | |
cryptos = sel.xpath("//tr[contains(@id, 'id-')]").extract() | |
print(cryptos) | |
for crypto in cryptos: | |
soup = BeautifulSoup(crypto, features='html.parser') | |
sel = Selector(text=soup.prettify()) | |
nom = sel.xpath("//td[contains(@class, 'currency-name')]/@data-sort").extract_first() | |
symbole = sel.xpath("//td[contains(@class, 'col-symbol')]/text()").extract_first() | |
cap_marche = sel.xpath("//td[contains(@class, 'market-cap')]/text()").extract_first() | |
prix = sel.xpath("//a[@class='price']/@data-usd").extract_first() | |
offre_circulation = sel.xpath("//a[@class='volume']/@data-usd").extract_first() | |
volume = sel.xpath("//td[contains(@class, 'circulating-supply')]/@data-sort").extract_first() | |
percent_1h = sel.xpath("//td[@data-timespan='1h']/@data-sort").extract_first() | |
percent_24h = sel.xpath("//td[@data-timespan='24h']/@data-sort").extract_first() | |
percent_7j = sel.xpath("//td[@data-timespan='7d']/@data-sort").extract_first() | |
clean_values = [] | |
values = [nom, symbole, cap_marche, prix, offre_circulation, volume, percent_1h, percent_24h, percent_7j] | |
for value in values: | |
if value: | |
value = value.strip().replace('\n', '') | |
clean_values.append(value) | |
print(', '.join(clean_values)) | |
dict_row = dict(zip(fieldnames, clean_values)) | |
writer.writerow(dict_row) | |
# TEMPS PASSE | |
end = datetime.datetime.now() | |
time_elapsed = str(end - start) | |
print('\n') | |
print('-- TIME ELAPSED --') | |
print(time_elapsed) | |
break | |
elif response.status_code == 404: | |
print("Page indisponible") | |
break | |
else: | |
print("Impossible d'accéder à la page") | |
return [] | |
def main(): | |
url = "https://coinmarketcap.com/fr/all/views/all/" | |
extract(url) | |
if __name__ == '__main__': | |
main() |
same here
The code is correct in principle but the problem is the server side has such screen scraping against terms of service, and so obfuscates the html and xml class tags and id tags, which causes a failure because the HTML has changed format and the selectors don't find anything, Then the beautifulsoup loop returns no matching criteria.
Here's some hints to hack the breath of life back into it:
cryptos = sel.xpath("//tr[contains(@class, 'cmc-table-row')]").extract()
and
nom = sel.xpath("//a[contains(@class, 'cmc-table__column-name--name')]/text()").extract_first()
Works for me at the moment, but by the time you read this, it will have probably changed again. Better to reach into your wallet and pay cash-money for the unlimited coinbase API.
See: https://gist.github.com/sentientmachine/820e0c47b07fe90580e4c758d9c5df97
Update: scraping coinmarketcap is too much work. Sustainable solution is to get an account at an exchange like coinbase, get a developer API key and get that way. Mostly plug and play, much easier than playing calvinball with this html javascript https://developers.coinbase.com/docs/wallet/guides/price-data
@sentientmachine thanks
Same issue here:( is there any thing to do to fix it ?