Last active
October 18, 2017 13:37
-
-
Save rmitsch/8a2be762de94cbfe8051749421a0cd0b to your computer and use it in GitHub Desktop.
Implementation of exercise 2-3 for VU Data Mining at University of Vienna.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy | |
import scipy | |
import scipy.io.arff | |
import sklearn.feature_selection as skfs | |
from sklearn.cluster import KMeans | |
from sklearn.metrics.cluster import normalized_mutual_info_score | |
def arff_to_ndarray(path_to_arff_file):
    """
    Convert the contents of an .arff file to a numpy feature matrix and label vector.

    Assumes the last attribute declared in the file is the class/label column.

    :param path_to_arff_file: Path to the .arff file to load.
    :return: Tuple of (numpy.ndarray of shape (n_samples, n_features) holding
             the feature values as floats, 1-D array with one label per sample).
    """
    # recfunctions is a submodule that a plain `import numpy` does not expose.
    from numpy.lib import recfunctions

    # Load as numpy objects.
    data, meta = scipy.io.arff.loadarff(path_to_arff_file)
    # Extract labels (last declared attribute).
    labels = data[meta.names()[-1]]
    # Keep only the feature columns, i.e. everything but the label attribute.
    features = data[meta.names()[:-1]]
    # structured_to_unstructured is the supported way to turn a structured
    # (record) array into a plain 2-D float matrix. The previous
    # `.view(numpy.float).reshape(...)` trick broke twice on modern NumPy:
    # the `numpy.float` alias was removed in 1.24, and since 1.16 multi-field
    # indexing returns a non-contiguous view that `.view` refuses to recast.
    data = recfunctions.structured_to_unstructured(features).astype(float)
    return data, labels
# Load dataset (features X, textual class labels y).
X, y = arff_to_ndarray("diabetes.arff")

# 1. Forward selection, measured by chi2-measure (independency of feature variable from class variable).
X_chi2 = skfs.SelectKBest(skfs.chi2, k=2).fit_transform(X, y)

# 2. Forward selection, measured by mutual information (similarity of feature and target variable).
X_mi = skfs.SelectKBest(skfs.mutual_info_classif, k=2).fit_transform(X, y)

# 3. Apply k-means with the first two data points as initial cluster centers.
#    n_init=1 because the initialization is fixed (re-running would be pointless).
#    NOTE: the former n_jobs=2 argument was removed from KMeans in scikit-learn 1.0
#    and would now raise a TypeError; parallelism is handled internally.
kmeans = KMeans(n_clusters=2, init=X[:2], n_init=1, max_iter=10000).fit(X)
kmeans_chi2 = KMeans(n_clusters=2, init=X_chi2[:2], n_init=1, max_iter=10000).fit(X_chi2)
kmeans_mi = KMeans(n_clusters=2, init=X_mi[:2], n_init=1, max_iter=10000).fit(X_mi)

# 4. Evaluate results using NMI between actual predicted labels with different feature selection approaches.
# Convert textual labels to 1/0 associations. Both mappings are built because
# k-means cluster IDs are arbitrary - cluster 0 may correspond to either class.
y_1_to_pos = numpy.asarray([1 if label == b'tested_positive' else 0 for label in y])
y_0_to_pos = numpy.asarray([0 if label == b'tested_positive' else 1 for label in y])

# Evaluate results for original dataset with all features.
print(normalized_mutual_info_score(y_0_to_pos, kmeans.labels_))  # 0.0297237939655
# Evaluate results for chi2-reduced dataset.
print(normalized_mutual_info_score(y_0_to_pos, kmeans_chi2.labels_))  # 0.0303491479031
# Evaluate results for MI-reduced dataset.
print(normalized_mutual_info_score(y_0_to_pos, kmeans_mi.labels_))  # 0.139509464881

# Cross-check: Calculate accuracy under both possible label-to-cluster mappings.
print("---------------")
print(numpy.sum(y_0_to_pos == kmeans.labels_) / len(y))       # 0.66015625
print(numpy.sum(y_1_to_pos == kmeans.labels_) / len(y))       # 0.33984375
print(numpy.sum(y_0_to_pos == kmeans_chi2.labels_) / len(y))  # 0.66015625
print(numpy.sum(y_1_to_pos == kmeans_chi2.labels_) / len(y))  # 0.33984375
print(numpy.sum(y_0_to_pos == kmeans_mi.labels_) / len(y))    # 0.66015625
print(numpy.sum(y_1_to_pos == kmeans_mi.labels_) / len(y))    # 0.33984375
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment