3_1_PreDeCon
""" | |
Exercise 3-1 for course Data Mining at University of Vienna, winter semester 2017. | |
Note that this code implementing some definitions for PreDeCon serves demonstration purposes and is not optimized for | |
performance. | |
""" | |
import numpy
import math
# 0. Define hyperparameters.
param_mu = 3        # Minimum number of points in a preference weighted epsilon-neighbourhood (core point condition).
param_epsilon = 1   # Radius of the epsilon-neighbourhood.
param_theta = 0.25  # Variance threshold at or below which an attribute counts as a preference dimension.
param_lambda = 1    # Maximum subspace preference dimensionality allowed for a core point.
param_kappa = 100   # Weight assigned to attributes with variance <= theta.
# ------------------------------------------------------- #
# 1. Define points.
# Sequence: Ascending from P1 to P12.
D = numpy.array([[1, 6], [2, 6], [3, 6], [4, 6], [5, 6], [6, 6], [7, 8], [7, 7], [7, 6], [7, 5], [7, 4], [7, 3]])
num_points = len(D)
num_dimensions = len(D[0])
# ------------------------------------------------------- #
# 3. Gather epsilon-neighbourhood for each point.
epsilon_neighbourhoods = []
# Iterate over points.
for i in range(0, num_points):
    epsilon_neighbourhood_i = []
    # Check all points (note that each point lies in its own epsilon-neighbourhood).
    for j in range(0, num_points):
        # Append to neighbourhood array if the norm is smaller than or equal to epsilon.
        if numpy.linalg.norm(D[i] - D[j]) <= param_epsilon:
            epsilon_neighbourhood_i.append(j)
    epsilon_neighbourhoods.append(epsilon_neighbourhood_i)
# ------------------------------------------------------- #
# 4. Calculate attribute variance.
attribute_variances = [[] for i in range(0, num_points)]
# Loop over dimensions/attributes.
for i_attribute in range(0, num_dimensions):
    # Loop over points.
    for i_datapoint in range(0, num_points):
        # Get number of epsilon-neighbours.
        number_of_epsilon_neighbours = len(epsilon_neighbourhoods[i_datapoint])
        # Store sum of squared distances to neighbours.
        distance_sum = 0
        # Loop over neighbouring points.
        for i_datapoint_neighbour in epsilon_neighbourhoods[i_datapoint]:
            # Accumulate the squared difference along this attribute between the point and its epsilon-neighbour.
            distance_sum += (D[i_datapoint][i_attribute] - D[i_datapoint_neighbour][i_attribute]) ** 2
        # Add this attribute's variance (mean squared distance over the neighbourhood) to the collection.
        attribute_variances[i_datapoint].append(distance_sum / number_of_epsilon_neighbours)
# ------------------------------------------------------- #
# 5. Calculate subspace preference dimensionality.
subspace_preference_dimensionalities = []
for i_datapoint in range(0, num_points):
    # Sum up the number of this datapoint's attribute projections with variance <= theta.
    subspace_preference_dimensionalities.append(
        sum(att_var <= param_theta for att_var in attribute_variances[i_datapoint])
    )
# ------------------------------------------------------- #
# 6. Calculate subspace preference vectors.
subspace_preference_vectors = [[] for i in range(0, num_points)]
for i_datapoint in range(0, num_points):
    # Weight is 1 for attributes with variance > theta and kappa for low-variance (preferred) attributes.
    subspace_preference_vectors[i_datapoint] = \
        [1 if att_var > param_theta else param_kappa for att_var in attribute_variances[i_datapoint]]
# ------------------------------------------------------- #
# 7. Calculate preference weighted similarity measures between all datapoints.
pref_weighted_similarity_measures = [[] for i in range(0, num_points)]
for i_datapoint in range(0, num_points):
    for j_datapoint in range(0, num_points):
        if i_datapoint != j_datapoint:
            # Weighted Euclidean distance: each squared attribute difference is scaled by the reciprocal of the
            # corresponding entry in the subspace preference vector.
            pref_weighted_similarity_measures[i_datapoint].append(
                math.sqrt(
                    sum(
                        (
                            1.0 / subspace_preference_vectors[i_datapoint][i_attribute] *
                            ((D[i_datapoint][i_attribute] - D[j_datapoint][i_attribute]) ** 2)
                        )
                        for i_attribute in range(0, num_dimensions)
                    )
                )
            )
        else:
            pref_weighted_similarity_measures[i_datapoint].append(0.0)
# ------------------------------------------------------- #
# 8. Gather preference weighted epsilon-neighbourhoods.
preference_weighted_neighbourhoods = [[] for i in range(0, num_points)]
for i_datapoint in range(0, num_points):
    for j_datapoint in range(0, num_points):
        # Symmetrise by taking the maximum of the two asymmetric similarity measures.
        dist_pref = max(
            pref_weighted_similarity_measures[i_datapoint][j_datapoint],
            pref_weighted_similarity_measures[j_datapoint][i_datapoint]
        )
        if dist_pref <= param_epsilon:
            preference_weighted_neighbourhoods[i_datapoint].append(j_datapoint)
# ------------------------------------------------------- #
# 9. Determine core points.
core_points = []
for i_datapoint in range(0, num_points):
    # Core point: subspace preference dimensionality <= lambda and at least mu preference weighted neighbours.
    if subspace_preference_dimensionalities[i_datapoint] <= param_lambda and \
            len(preference_weighted_neighbourhoods[i_datapoint]) >= param_mu:
        core_points.append(i_datapoint)
# ------------------------------------------------------- #
print("--------------------------------------------")
print("Core points:", core_points)
print("--------------------------------------------")
R3 allows a point to be in its own epsilon-neighbourhood (as opposed to R2).
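A minimal sketch of that difference, using a hypothetical include_self flag that is not part of the gist; True matches the code in step 3 above (R3), while False excludes the point itself and yields the R2 variant:

# Hypothetical toggle (not in the original code): include the point itself in its neighbourhood or not.
include_self = True  # True corresponds to R3, False to R2.
epsilon_neighbourhoods = []
for i in range(0, num_points):
    epsilon_neighbourhood_i = []
    for j in range(0, num_points):
        if not include_self and i == j:
            continue
        if numpy.linalg.norm(D[i] - D[j]) <= param_epsilon:
            epsilon_neighbourhood_i.append(j)
    epsilon_neighbourhoods.append(epsilon_neighbourhood_i)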
Results:
Core points: [1, 2, 3, 4, 5, 7, 8, 9, 10]
Core points: [8]
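The printed indices are 0-based. A minimal sketch (reusing the variables from the gist) for mapping them back to the labels P1-P12 used in the point definition comment:

# Map 0-based core point indices back to the labels P1..P12 from the point definition comment.
print("Core points (labels):", ["P" + str(index + 1) for index in core_points])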