Extract all cryptocurrency data from coinmarketcap.com using Python 3 and Requests
#!/usr/bin/python3
# coding: utf-8
# pip3 install --user requests bs4 scrapy
import csv
import datetime
import os

import requests
from bs4 import BeautifulSoup
from scrapy import Selector
def extract(url):
    """Export all crypto data from coinmarketcap.com.

    Arguments:
        url (str): url of the targeted Coinmarketcap page
    Returns:
        an empty list; the scraped rows are written to a .csv file
    """
    print("Export all crypto data from coinmarketcap.com")
    # Initialization
    r = requests.Session()
    start = datetime.datetime.now()
    # Retry if the site is inaccessible.
    for retry in range(10):
        response = r.get(url=url)
        print("response is: ")
        print(response.headers)
        print("-- STATUS CODE --")
        print(response.status_code)
        # I gave up on maintaining this code since screen-scraping is against the
        # terms of coinmarketcap.com and they work very hard to prevent it. The
        # sustainable solution for me was to make an account with coinbase.com and
        # use the developer's API key; there is some Python code to request OHLCV
        # daily prices (see the sketch after this script). requests, scrapy and
        # BeautifulSoup aren't strong enough for this job, and the HTML gets
        # obfuscated anew every few months anyway, to throw you off.
        print("now do parsing")
        if response.status_code == 200:
            print("response code is good")
            # Make sure the output directory exists before opening the file.
            os.makedirs("/tmp/coinmarketcap", exist_ok=True)
            #with open("/path/to/coinmarketcap/cryptocurrencies_{}.csv".format(str(datetime.date.today())), "w") as f:
            with open("/tmp/coinmarketcap/cryptocurrencies_{}.csv".format(str(datetime.date.today())), "w") as f:
                fieldnames = ['Name', 'Symbol', 'Market cap', 'Price', 'Circulating supply', 'Volume (24h)', '% 1h', '% 24h', '% 7d']
                writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
                writer.writeheader()
                soup = BeautifulSoup(response.text, features='html.parser')
                sel = Selector(text=soup.prettify())
                #cryptos = sel.xpath("//tr[contains(@id, 'id-')]").extract()  # old markup
                cryptos = sel.xpath("//tr[contains(@class, 'cmc-table-row')]").extract()
                print("cryptos list: '" + str(cryptos) + "'")
                for crypto in cryptos:
                    soup = BeautifulSoup(crypto, features='html.parser')
                    sel = Selector(text=soup.prettify())
                    print("found a crypto: " + str(crypto))
                    #nom = sel.xpath("//td[contains(@class, 'currency-name')]/@data-sort").extract_first()  # old markup
                    nom = sel.xpath("//a[contains(@class, 'cmc-table__column-name--name')]/text()").extract_first()
                    symbole = sel.xpath("//td[contains(@class, 'col-symbol')]/text()").extract_first()
                    cap_marche = sel.xpath("//td[contains(@class, 'market-cap')]/text()").extract_first()
                    prix = sel.xpath("//a[@class='price']/@data-usd").extract_first()
                    # The next two selectors were swapped: circulating supply comes from
                    # the 'circulating-supply' cell and volume from the 'volume' anchor.
                    offre_circulation = sel.xpath("//td[contains(@class, 'circulating-supply')]/@data-sort").extract_first()
                    volume = sel.xpath("//a[@class='volume']/@data-usd").extract_first()
                    percent_1h = sel.xpath("//td[@data-timespan='1h']/@data-sort").extract_first()
                    percent_24h = sel.xpath("//td[@data-timespan='24h']/@data-sort").extract_first()
                    percent_7j = sel.xpath("//td[@data-timespan='7d']/@data-sort").extract_first()
                    values = [nom, symbole, cap_marche, prix, offre_circulation, volume, percent_1h, percent_24h, percent_7j]
                    clean_values = []
                    for value in values:
                        # Use '' for missing cells so the row stays aligned with fieldnames.
                        clean_values.append(value.strip().replace('\n', '') if value else '')
                    #print(', '.join(clean_values))
                    dict_row = dict(zip(fieldnames, clean_values))
                    writer.writerow(dict_row)
            # amount of time elapsed
            end = datetime.datetime.now()
            time_elapsed = str(end - start)
            print('\n')
            print('-- TIME ELAPSED --')
            print(time_elapsed)
            break
        elif response.status_code == 404:
            print("Page unavailable")
            break
        else:
            print("Can't load page; retrying.")
    return []
def main():
    #url = "https://coinmarketcap.com/fr/all/views/all/"
    url = "https://coinmarketcap.com/all/views/all/"
    print("url: '" + str(url) + "'")
    extract(url)

if __name__ == '__main__':
    main()
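
The comments in the script point to the Coinbase API as the sustainable alternative to scraping. Below is a minimal sketch of that approach, assuming the public Coinbase Exchange candles endpoint; public market data needs no API key (the authenticated endpoints are where the developer key mentioned above comes in), and the product id BTC-USD plus the /tmp output path are illustrative choices, not part of the original gist.

#!/usr/bin/python3
# Sketch: fetch daily OHLCV candles from the Coinbase Exchange public API.
import csv
import requests

def fetch_daily_ohlcv(product_id="BTC-USD"):
    # granularity=86400 means one candle per day; the endpoint returns at most
    # 300 candles per request, so page with start/end parameters for more history.
    url = "https://api.exchange.coinbase.com/products/{}/candles".format(product_id)
    response = requests.get(url, params={"granularity": 86400}, timeout=30)
    response.raise_for_status()
    # Each row is [time, low, high, open, close, volume], newest first.
    return response.json()

def main():
    candles = fetch_daily_ohlcv("BTC-USD")
    with open("/tmp/btc_usd_daily_ohlcv.csv", "w") as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(["time", "low", "high", "open", "close", "volume"])
        writer.writerows(candles)

if __name__ == '__main__':
    main()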