import sys
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import eigh
import math
# Read semicolon-separated CSV data from a file into a list of numpy vectors.
def read_data(file):
    f = open(file, 'r')
    data = []
    for line in f:
        line = line.split(';')
        # Drop the empty field produced by the trailing separator.
        del line[-1]
        vec = np.zeros(len(line))
        c = 0
        for x in line:
            vec[c] = x
            c += 1
        data.append(vec)
    f.close()
    return data
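
# Hedged usage sketch (illustrative file name and contents, not part of the assignment data):
# read_data expects semicolon-separated values with a trailing separator per line, hence the
# del line[-1] above. A file containing "1.0;2.0;\n0.5;1.5;\n" would yield two 2-dimensional points.
# with open('toy.csv', 'w') as _f:
#     _f.write('1.0;2.0;\n0.5;1.5;\n')
# assert len(read_data('toy.csv')) == 2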
def eps_range(q, D, epsilon):
    """
    (b) Returns the (Euclidean) epsilon-neighbourhood of query point q in data set D.
    :param q: Query point.
    :param D: Data set (list of numpy vectors).
    :param epsilon: Neighbourhood radius.
    :return: List of points within distance epsilon of q.
    """
    neighbourhood = []
    for neighbour_index in range(0, len(D)):
        if np.linalg.norm(q - D[neighbour_index]) <= epsilon:
            neighbourhood.append(np.array(D[neighbour_index]))
    return neighbourhood
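
# Hedged sanity sketch (toy data, not from the assignment): with epsilon = 1.5 the
# neighbourhood of the origin contains the origin itself and (1, 0), but not (3, 3).
_toy_points = [np.array([0.0, 0.0]), np.array([1.0, 0.0]), np.array([3.0, 3.0])]
assert len(eps_range(np.array([0.0, 0.0]), _toy_points, 1.5)) == 2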
def derive_local_correlations(D, epsilon, kappa, delta):
    """
    (c) See definition #10 and the introduction to section 3.3: "...which is determined by PCA of the
    points in the epsilon-neighborhood of P." Eigenvalues and eigenvectors for the PCA are computed
    on each point's epsilon-neighbourhood.
    :param D: Data set.
    :param epsilon: Neighbourhood radius.
    :param kappa: Value substituted for small normalized eigenvalues (kappa >> 1).
    :param delta: Threshold separating large from small normalized eigenvalues.
    :return: List of local correlation similarity matrices, one per point in D.
    """
    local_correlations = []
    for index in range(0, len(D)):
        # Calculate eigenvalues and eigenvectors of the covariance matrix of D[index]'s neighbourhood.
        eigenvalues, eigenvectors = eigh(
            # rowvar=False since rows correspond to data points/observations and columns to features/variables.
            np.cov(eps_range(D[index], D, epsilon), rowvar=False)
        )
        # Invert normalized eigenvalues.
        # Note that the eigenvalues have to be divided by their maximum, which is conveniently not
        # described in detail in the actual definition.
        new_eigenvalues = np.array(
            [1 if normed_eigenvalue > delta else kappa
             for normed_eigenvalue in np.divide(eigenvalues, np.max(eigenvalues))]
        )
        correlation_similarity_matrix = eigenvectors @ np.diag(new_eigenvalues) @ np.transpose(eigenvectors)
        local_correlations.append(correlation_similarity_matrix)
        # print("Correlation similarity matrix:\n", correlation_similarity_matrix)
    return local_correlations
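
# Hedged sketch (toy data): for an isotropic neighbourhood both normalized eigenvalues exceed
# delta, so every local correlation similarity matrix reduces to the identity.
_toy_square = [np.array([0.0, 0.0]), np.array([0.1, 0.0]), np.array([0.0, 0.1]), np.array([0.1, 0.1])]
_toy_square_mats = derive_local_correlations(_toy_square, epsilon=1.0, kappa=50, delta=0.5)
assert len(_toy_square_mats) == 4 and np.allclose(_toy_square_mats[0], np.eye(2))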
def correlation_dist(x, S1, y, S2):
    """
    (d) See definition #10 (correlation similarity matrix).
    Assuming max() is the operation for combining the two directed distances and that the correlation
    similarity matrix equals the "local distance matrix".
    :param x: First point.
    :param S1: Local correlation similarity matrix of x.
    :param y: Second point.
    :param S2: Local correlation similarity matrix of y.
    :return: Symmetric correlation distance between x and y.
    """
    return max(
        calculate_correlation_similarity(x, y, S1),
        calculate_correlation_similarity(y, x, S2)
    )
def calculate_correlation_similarity(x, y, correlation_similarity_matrix_for_x):
    """
    Calculates the correlation similarity used for the correlation distance (def. #10).
    :param x: Point whose local correlation similarity matrix is used.
    :param y: Point to which the distance is measured.
    :param correlation_similarity_matrix_for_x: Local correlation similarity matrix of x.
    :return: sqrt((x - y) M (x - y)^T) with M the local matrix of x.
    """
    diff_xy = np.subtract(x, y)
    # @ for vector-matrix multiplication.
    return np.sqrt(diff_xy @ correlation_similarity_matrix_for_x @ np.transpose(diff_xy))
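
# Hedged sketch: with the identity as local distance matrix the correlation similarity reduces
# to the plain Euclidean distance, and correlation_dist to the maximum of two equal values.
_a, _b = np.array([0.0, 0.0]), np.array([3.0, 4.0])
assert np.isclose(calculate_correlation_similarity(_a, _b, np.eye(2)), 5.0)
assert np.isclose(correlation_dist(_a, np.eye(2), _b, np.eye(2)), 5.0)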
def corr_eps_range(query_id, D, epsilon, mats):
    """
    (e) See definition #11. Assuming corr_eps_range is to return the correlation epsilon-neighbourhood,
    since this is not properly documented in the assignment.
    :param query_id: Index of the query point in D.
    :param D: Data set.
    :param epsilon: Neighbourhood radius.
    :param mats: List of local correlation similarity matrices, one per point in D.
    :return: List of indices of the points in the correlation epsilon-neighbourhood of the query point.
    """
    corr_eps_neighbourhood = []
    # Check every point in D, including the query point itself.
    for index in range(0, len(D)):
        correlation_distance = correlation_dist(
            D[query_id],
            mats[query_id],
            D[index],
            mats[index]
        )
        if correlation_distance <= epsilon:
            corr_eps_neighbourhood.append(index)
    return corr_eps_neighbourhood
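
# Hedged sketch: with identity local matrices the correlation epsilon-neighbourhood degenerates
# to an ordinary Euclidean range query. Note that indices (not points) are returned and that the
# query point's own index is part of its neighbourhood.
_toy_D = [np.array([0.0, 0.0]), np.array([0.5, 0.0]), np.array([5.0, 5.0])]
assert corr_eps_range(0, _toy_D, 1.0, [np.eye(2)] * 3) == [0, 1]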
def expand(outer, D, eps, minPts, cur_cluster_ID, mats, processed):
    """
    Expands a cluster from the point with index 'outer', DBSCAN-style: if 'outer' is a core point,
    all points reachable from it via core points are assigned the current cluster ID in 'processed'.
    """
    seed = corr_eps_range(outer, D, eps, mats)
    # Return if 'outer' is not a core point.
    if len(seed) < minPts:
        return False
    # 'outer' is a core point.
    processed[outer] = cur_cluster_ID
    while seed:
        q = seed.pop()
        if processed[q] == 0:
            neighbourhood = corr_eps_range(q, D, eps, mats)
            # If q is a core point as well, its neighbours become expansion candidates.
            if len(neighbourhood) >= minPts:
                for x in neighbourhood:
                    seed.append(x)
            processed[q] = cur_cluster_ID
    return True
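
# Hedged sketch: for three mutually close points with identity local matrices and minPts = 2,
# expanding from point 0 should assign all three points to the given cluster ID.
_toy_cluster = [np.array([0.0, 0.0]), np.array([0.1, 0.0]), np.array([0.0, 0.1])]
_toy_processed = np.zeros(3)
assert expand(0, _toy_cluster, 0.5, 2, 1, [np.eye(2)] * 3, _toy_processed) and np.all(_toy_processed == 1)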
def algorithm4c(eps, minpts, kappa, delta):
    # Note: this did not run as intended with Python 3.5, hence the variable definitions were
    # moved outside the function and are accessed via global here.
    global processed, cur_cluster_ID
    # Build local distance measures.
    cor_mat = derive_local_correlations(D, eps, kappa, delta)
    print('Finished preprocessing')
    # Run 4C.
    processed = np.zeros(n)
    cur_cluster_ID = 1
    for outer in range(0, n):
        if processed[outer] == 0:
            if expand(outer, D, eps, minpts, cur_cluster_ID, cor_mat, processed):
                print('Found cluster %s' % cur_cluster_ID)
                cur_cluster_ID += 1
            else:
                # Mark as noise.
                processed[outer] = -1
    return processed
################################
# Prepare and preprocess data.
################################
# Read data.
D = read_data('data.csv')
# Set dimensionality d and number of instances n.
d = np.shape(D[0])[0]
n = len(D)
print(n)
print(d)
xs = [x[0] for x in D]
ys = [x[1] for x in D]
# plt.plot(xs, ys, 'b.')
# plt.show()
# Declare global variables.
processed = None
cur_cluster_ID = None
################################
# Execute 4C.
################################
# Set local parameters.
minpts = 20
eps = 0.05
kappa = 50
results = []
# Prepare plotting: one marker style per cluster (cycled if there are more clusters than styles).
cluster_design = ['r.', 'b.', 'g.', 'y.', 'ro', 'bo', 'go', 'yo', 'r+', 'b+', 'g+', 'y+', 'r*', 'b*', 'g*', 'y*']
run_index = 0
delta_values = [1e-10, 1e-1, 0.6, 1]
n_rows = math.ceil(math.sqrt(len(delta_values)))
n_cols = math.ceil(len(delta_values) / n_rows)
fig, subplots = plt.subplots(n_rows, n_cols, sharex=True, sharey=True, squeeze=False)
for delta in delta_values:
    results.append(algorithm4c(eps, minpts, kappa, delta))
    # Set subplot's title.
    subplots[run_index // n_cols, run_index % n_cols].set_title(str(delta))
    for c in range(1, cur_cluster_ID):
        x = []
        y = []
        for i in range(0, n):
            if c == results[run_index][i]:
                x.append(D[i][0])
                y.append(D[i][1])
        # Plot result for current cluster.
        subplots[run_index // n_cols, run_index % n_cols].plot(x, y, cluster_design[(c - 1) % len(cluster_design)])
    # Keep track of the number of runs so far.
    run_index += 1
# Show plots.
plt.show()