Skip to content

Instantly share code, notes, and snippets.

@DustinAlandzes
Created September 2, 2017 01:29
Show Gist options
  • Save DustinAlandzes/df2f03ca940efb6b8b897af47c01ee7f to your computer and use it in GitHub Desktop.
Save DustinAlandzes/df2f03ca940efb6b8b897af47c01ee7f to your computer and use it in GitHub Desktop.
Identifying Highly Correlated Cryptocurrencies using PCA: http://www.quantatrisk.com/2017/03/31/cryptocurrency-portfolio-correlation-pca-python/
# N-CryptoAsset Portfolios: Identifying Highly Correlated
# Cryptocurrencies using PCA
#
# (c) 2017 QuantAtRisk.com, by Pawel Lachowicz
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
from datetime import datetime
import json
from bs4 import BeautifulSoup
import requests
# custom plot colour: an RGB triple, passed as `color=` when drawing
# the loading vectors of the bi-plot
grey = (0.6, 0.6, 0.6)
def timestamp2date(timestamp):
    """Convert a Unix timestamp into a 'YYYY-MM-DD' calendar date (UTC).

    Parameters
    ----------
    timestamp : int, float or str
        Seconds since the Unix epoch; anything accepted by ``int()``.
        Fractions of a second are truncated.

    Returns
    -------
    str
        The calendar date in UTC, formatted as ``'%Y-%m-%d'``.

    Notes
    -----
    The conversion is pinned to UTC so the result does not depend on the
    time zone of the machine running the script. A naive
    ``datetime.fromtimestamp`` would use the local zone and could move a
    stamp taken at UTC midnight onto the previous day.
    """
    # local import: the module header only imports the `datetime` class
    from datetime import timezone

    moment = datetime.fromtimestamp(int(timestamp), tz=timezone.utc)
    return moment.strftime('%Y-%m-%d')
def date2timestamp(date):
    """Convert a 'YYYY-MM-DD' calendar date into a Unix timestamp.

    Parameters
    ----------
    date : str
        Date formatted as ``'%Y-%m-%d'``.

    Returns
    -------
    float
        Seconds since the Unix epoch for 00:00:00 UTC of that date.

    Raises
    ------
    ValueError
        If `date` does not match the ``'%Y-%m-%d'`` format.

    Notes
    -----
    The date is interpreted as UTC midnight so the result does not depend
    on the time zone of the machine running the script.
    """
    # local import: the module header only imports the `datetime` class
    from datetime import timezone

    # parse the *argument*; the previous body read an undefined global
    # (`date_today`) and therefore always raised NameError
    midnight = datetime.strptime(date, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    return midnight.timestamp()
def fetchCryptoClose(fsym, tsym):
    """Fetch the daily close-price time-series of `fsym` quoted in `tsym`.

    Queries the cryptocompare.com ``histoday`` endpoint for up to two
    consecutive pages (``limit=2000`` each). The first page ends today;
    the second ends at the oldest day kept from the first page.

    Candles whose open + high + low + close is not positive are skipped;
    this may drop a coin such as USDT entirely (near-zero pricing).

    Parameters
    ----------
    fsym : str
        Ticker of the coin to download; also the name of the price column.
    tsym : str
        Ticker of the quote currency, e.g. ``'USD'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by a DatetimeIndex named ``'date'`` (oldest first, one row
        per day), with a single column `fsym` holding the close prices.

    Raises
    ------
    requests.RequestException
        On network failure, timeout or a non-2xx HTTP status.
    ValueError
        If the service returns no usable candle for the pair.
    """
    price_keys = ('open', 'high', 'low', 'close')
    pages = []  # kept oldest page first
    curr_timestamp = int(datetime.today().timestamp())

    for _ in range(2):
        url = "https://min-api.cryptocompare.com/data/histoday?fsym=" + fsym + \
              "&tsym=" + tsym + "&toTs=" + str(curr_timestamp) + "&limit=2000"
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # the body is plain JSON, so decode it directly; a missing or
        # empty 'Data' entry is treated as "no candles"
        candles = response.json().get('Data') or []

        rows = []
        # the first candle of each response is skipped, as before; slicing
        # copes with responses shorter than the requested limit
        for candle in candles[1:]:
            if sum(candle[key] for key in price_keys) > 0:  # remove for USDT
                # pick the close price by name (a positional pick is
                # easily off by one because of the extra date field)
                rows.append((timestamp2date(candle['time']),
                             candle['time'],
                             candle['close']))
        if not rows:
            # nothing (older) to page through
            break

        df = pd.DataFrame(rows, columns=['date', 'timestamp', fsym])
        # ensure a correct date format
        df.index = pd.to_datetime(df.date, format="%Y-%m-%d")
        df.drop('date', axis=1, inplace=True)
        pages.insert(0, df)
        # the next request ends at the oldest day collected so far
        curr_timestamp = int(df['timestamp'].iloc[0])

    if not pages:
        raise ValueError("no price data returned for " + fsym + "/" + tsym)

    data = pd.concat(pages, axis=0)
    data.drop("timestamp", axis=1, inplace=True)
    # consecutive pages share their boundary day; keep a single row per
    # date so later index-based joins do not multiply rows
    data = data[~data.index.duplicated(keep='last')]
    return data
# N-Cryptocurrency Portfolio (tickers)
fsym = ['BTC', 'ETH', 'DASH', 'XMR', 'XRP', 'LTC', 'ETC', 'XEM', 'REP',
        'MAID', 'ZEC', 'STEEM', 'GNT', 'FCT', 'ICN', 'DGD',
        'WAVES', 'DCR', 'LSK', 'DOGE', 'PIVX']
# vs.
tsym = 'USD'

# Download every coin and join the series column-wise on the date index.
# The download stays best-effort (one failing coin does not abort the
# run), but failures are reported instead of being silently swallowed.
# The first coin that downloads successfully provides the base index.
data = None
for idx, ticker in enumerate(fsym):
    print(idx, ticker)
    try:
        series = fetchCryptoClose(ticker, tsym)
        data = series if data is None else data.join(series)
    except Exception as err:
        print("skipping " + ticker + ": " + repr(err))

if data is None:
    raise RuntimeError("no price series could be downloaded")

data = data.astype(float)  # ensure values to be floats
# save portfolio to a file (HDF5 file format); the context manager
# closes the store even when the write fails
with pd.HDFStore('portfolio.h5') as store:
    store['data'] = data
# read in your portfolio from a file
df = pd.read_hdf('portfolio.h5', 'data')
# drop duplicates: keep one row per date.
# (The previous statement read an undefined name, `df1`, and raised
# NameError. Rows with missing prices are deliberately kept here: the
# pre-processing step below removes incomplete coins column-wise for the
# analysed window, so dropping rows now would discard valid history.)
df = df[~df.index.duplicated(keep='first')]
# portfolio pre-processing
# restrict the sample to 2017-03-01 .. 2017-03-31 (both bounds inclusive)
dfP = df[(df.index <= "2017-03-31") & (df.index >= "2017-03-01")]
# discard every coin that has at least one missing price in that window
dfP = dfP.dropna(how='any', axis=1)
# per-coin sample mean and sample standard deviation (ddof=1)
m, s = dfP.mean(axis=0), dfP.std(axis=0, ddof=1)
# normalised (z-scored) time-series as an input for PCA
dfPort = (dfP - m) / s
# transpose so that every row handed to numpy is one coin (one variable)
c = np.cov(np.transpose(dfPort.values))  # covariance matrix
co = np.corrcoef(np.transpose(dfP.values))  # correlation matrix
tickers = dfP.columns.tolist()
# heat-map of the correlation matrix `co` (one row and one column per coin)
plt.figure(figsize=(8,8))
plt.imshow(co, cmap="RdGy", interpolation="nearest")
# colour scale for the matrix entries
cb = plt.colorbar()
cb.set_label("Correlation Matrix Coefficients")
plt.title("Correlation Matrix", fontsize=14)
# label both axes with the ticker symbols; x labels are rotated by 90 degrees
plt.xticks(np.arange(len(tickers)), tickers, rotation=90)
plt.yticks(np.arange(len(tickers)), tickers)
# perform PCA: eigen-decomposition of the covariance matrix `c`
# `c` is symmetric, so use eigh: it returns real eigenvalues/eigenvectors,
# whereas the general-purpose eig guarantees neither a real dtype nor any
# particular ordering of the eigenvalues
w, v = np.linalg.eigh(c)
# eigh sorts the eigenvalues in ascending order; flip to descending so
# that column j of `v` is PC-(j+1) and the last column is the component
# with the smallest variance
pc_order = np.argsort(w)[::-1]
w, v = w[pc_order], v[:, pc_order]
# heat-map of the eigenvector matrix `v` from the decomposition above:
# rows are labelled with the tickers, the x axis is the PC number
# keep the axes handle: it is needed below to configure the x-axis locator
ax = plt.figure(figsize=(8,8)).gca()
plt.imshow(v, cmap="bwr", interpolation="nearest")
cb = plt.colorbar()
plt.yticks(np.arange(len(tickers)), tickers)
plt.xlabel("PC Number")
plt.title("PCA", fontsize=14)
# force x-axis ticks to be displayed as integers (not floats)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
# choose PC-k numbers
k1 = -1  # the last PC column in 'v' PCA matrix
k2 = -2  # the second last PC column

# begin constructing bi-plot for PC(k1) and PC(k2)
# loadings: every coin is the point (x, y) = (v[i, k1], v[i, k2])
plt.figure(figsize=(7,7))
plt.grid()

# draw every loading vector and compute its distance from (0,0) point
dist = []
for i in range(v.shape[0]):
    x = v[i, k1]
    y = v[i, k2]
    plt.plot(x, y, '.k')
    plt.plot([0, x], [0, y], '-', color=grey)
    dist.append(np.sqrt(x**2 + y**2))

# a coin counts as an outlier when its distance exceeds the mean distance
# by more than one sample standard deviation; the cut-off does not change
# inside the loop below, so compute it once
threshold = np.mean(dist) + np.std(dist, ddof=1)

# check and save membership of a coin to
# a quarter number 1, 2, 3 or 4 on the plane
quar = []  # (row index into `tickers`, quarter number) of every outlier
for i in range(v.shape[0]):
    x = v[i, k1]
    y = v[i, k2]
    if dist[i] > threshold:
        # highlight the outlier in red
        plt.plot(x, y, '.r', markersize=10)
        plt.plot([0, x], [0, y], '-', color=grey)
        # a point lying exactly on an axis belongs to no quarter
        if (x > 0) and (y > 0):
            quar.append((i, 1))
        elif (x < 0) and (y > 0):
            quar.append((i, 2))
        elif (x < 0) and (y < 0):
            quar.append((i, 3))
        elif (x > 0) and (y < 0):
            quar.append((i, 4))
        plt.text(x, y, tickers[i], color='k')

# negative column offsets translated into 1-based PC numbers
plt.xlabel("PC-" + str(len(tickers)+k1+1))
plt.ylabel("PC-" + str(len(tickers)+k2+1))
def _plot_pair(first, second):
    """Print correlation metrics for two coins and scatter-plot their prices.

    Parameters
    ----------
    first, second : str
        Ticker symbols; both must be columns of the module-level `dfP`.

    Prints the ticker pair, then the Pearson correlation coefficient with
    its p-value (from the linear regression), then Kendall's tau with its
    p-value. Draws `second` against `first` together with the fitted
    regression line and shows the figure.
    """
    plt.figure(figsize=(7,4))
    # highly correlated coins according to the PC analysis
    print(first, second)
    ts1 = dfP[first]  # time-series
    ts2 = dfP[second]
    # correlation metrics and their p_values
    # NOTE: the third value returned by linregress is the correlation
    # coefficient r itself, not r squared
    slope, intercept, rvalue, pvalue, _ = stats.linregress(ts1, ts2)
    ktau, kpvalue = stats.kendalltau(ts1, ts2)
    print(rvalue, pvalue)
    print(ktau, kpvalue)
    plt.plot(ts1, ts2, '.k')
    xline = np.linspace(np.min(ts1), np.max(ts1), 100)
    yline = slope*xline + intercept
    plt.plot(xline, yline, '--', color='b')  # linear model fit
    plt.xlabel(first)
    plt.ylabel(second)
    plt.show()


# pair every outlier coin with the outliers of the opposite quarter:
# Q1 vs Q3 and Q2 vs Q4
opposite_quarter = {1: 3, 2: 4}
for i, qi in quar:
    for j, qj in quar:
        if opposite_quarter.get(qi) == qj:
            _plot_pair(tickers[i], tickers[j])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment