Skip to content

Instantly share code, notes, and snippets.

#data manipulation
import numpy as np
import pandas as pd
#network analysis
import networkx as nx
#plotting
%matplotlib inline
import matplotlib.pyplot as plt
# Load the asset price table; the 'Date' column becomes the row index
raw_asset_prices_df = pd.read_csv("asset_prices.csv", index_col='Date')
# Keep the (rows, columns) dimensions of the dataset for the summary below
df_shape = raw_asset_prices_df.shape
print(f"There are {df_shape[0]} rows and {df_shape[1]} columns in the dataset")
# Report the smallest and largest index labels as the period covered
print(f"Data timeperiod covers: {min(raw_asset_prices_df.index)} to {max(raw_asset_prices_df.index)}")
# Preview the first five rows
raw_asset_prices_df.head(5)
# Load the ETF lookup table, keeping only the code and alias columns
aliases = pd.read_csv("etf_names.csv", usecols=['Code', 'ETF Alias'])
# Show the lookup table
display(aliases)
# Collapse the table into a {code: alias} mapping
aliases = aliases.set_index('Code')['ETF Alias'].to_dict()
# Relabel the price columns: any column whose name is a known code gets its alias
raw_asset_prices_df = raw_asset_prices_df.rename(columns=aliases)
# Daily log returns for every asset, computed for all columns at once.
# Dates are given in reverse order, so each row is differenced against the
# row *below* it (periods=-1): log(price[i]) - log(price[i + 1]).
# The last row has no following row and is therefore NaN.
log_returns_df = np.log(raw_asset_prices_df).diff(-1)
#pairwise correlation between the return columns (Pearson, the pandas default)
correlation_matrix = log_returns_df.corr(method='pearson')
#preview the first five rows of the correlation matrix
correlation_matrix.head(5)
#visualise the correlation matrix as a clustered heatmap, preceded by an HTML heading
# NOTE(review): `display`, `HTML` and `sns` are not imported in the visible part of this file - confirm they are imported elsewhere
heatmap_heading = HTML("<h3>Clustered Heatmap: Correlations between asset price returns</h3>")
display(heatmap_heading)
sns.clustermap(correlation_matrix, cmap="RdYlGn")
plt.show()
#flatten the square matrix into long form: one row per (row label, column label) pair
edges = correlation_matrix.stack().reset_index()
edges.columns = ['asset_1', 'asset_2', 'correlation']
#keep only pairs made of two different assets, i.e. drop the self correlations
different_assets = edges['asset_1'] != edges['asset_2']
edges = edges[different_assets].copy()
#preview the first five rows of the edge list
edges.head(5)
#create undirected graph; every edge stores its 'correlation' value as an edge attribute
G0 = nx.from_pandas_edgelist(edges, 'asset_1', 'asset_2', edge_attr=['correlation'])
#print out the graph summary
# nx.info() was removed in networkx 3.0, so the summary is built from the graph's own
# counters, which are available in every networkx version
#check number of nodes and degrees are as expected (all should have degree = 38, i.e. average degree = 38)
n_nodes = G0.number_of_nodes()
n_edges = G0.number_of_edges()
# each undirected edge adds one to the degree of both of its endpoints;
# guard against an empty graph to avoid dividing by zero
average_degree = 2 * n_edges / n_nodes if n_nodes else 0.0
print(f"Number of nodes: {n_nodes}")
print(f"Number of edges: {n_edges}")
print(f"Average degree: {average_degree:.4f}")
# 2x2 grid of axes; the two top-row panels each show G0 under a different layout
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 20))
for panel, layout_fn, panel_title in (
    (ax[0, 0], nx.circular_layout, "Circular layout"),
    (ax[0, 1], nx.random_layout, "Random layout"),
):
    # same node/edge styling for every panel; only the node positions differ
    nx.draw(G0, with_labels=True, node_size=700, node_color="#e1575c",
            edge_color='#363847', pos=layout_fn(G0), ax=panel)
    panel.set_title(panel_title)
# 'winner takes all' method - minimum correlation threshold used to remove some edges from the diagram
threshold = 0.5
# build a fresh graph from the edge list, keeping 'correlation' as an edge attribute
Gx = nx.from_pandas_edgelist(edges, source='asset_1', target='asset_2', edge_attr=['correlation'])
# edges selected for removal are collected here
remove = []
# loop through edges in Gx and find correlations which are below the threshold
for asset_1, asset_2 in Gx.edges():