Created
October 9, 2014 04:54
-
-
Save zarch/abb3993c55fb61b834d1 to your computer and use it in GitHub Desktop.
Sequential forward floating feature selection with Jeffries-Matusita Distance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Sequential forward floating feature selection with Jeffries-Matusita Distance | |
============================================================================== | |
Reference: Pudil, P.; Novovicová, J. & Kittler, J. | |
Floating search methods in feature selection Pattern recognition letters, | |
Elsevier, 1994, 15, 1119-1125 | |
Authors: Michele Dalponte & Hans Ole Ørka & Pietro Zambelli | |
Date: Oct, 2014 | |
Usage | |
------ | |
Suppose to have a CSV file that is spaced separated, then you can select the features with: | |
$ python2 fs_pudil.py -m min data.csv | |
""" | |
from __future__ import print_function | |
from itertools import combinations | |
from math import log, exp, sqrt | |
import numpy as np | |
from numpy.linalg import inv, det | |
VERBOSE = True | |
def read(finput, delimiter=None, skip_header=1): | |
"""Read a file of input and return classes and data""" | |
sdata = np.genfromtxt(finput, dtype=str, delimiter=delimiter, | |
skip_header=skip_header) | |
return sdata.T[-1], sdata[:, :-1].astype(float) | |
def fgroup(classes, data, func, labels=None): | |
labels = labels if labels else sorted(set(classes)) | |
res = {} | |
for label in labels: | |
res[label] = func(data[classes == label]) | |
return res | |
def mean(data): | |
return np.mean(data, axis=0) | |
def cov(data): | |
return np.cov(data.T) | |
def v2m(arr, id0, id1): | |
"""Extract values from a covariance matrix. :: | |
>>> arr = np.arange(1, 10).reshape((3, 3)) | |
>>> arr | |
array([[1, 2, 3], | |
[4, 5, 6], | |
[7, 8, 9]]) | |
>>> v2m(arr, (0, 1), (0, 1)) | |
array([[1, 2], | |
[4, 5]]) | |
>>> v2m(arr, (0, 2), (0, 2)) | |
array([[1, 3], | |
[7, 9]]) | |
""" | |
x, y = np.array([(i0, i1) for i0 in id0 for i1 in id1]).T | |
return arr[x, y].reshape((len(id0), len(id1))) | |
def verbose(numb_of_features, dist, fs): | |
if VERBOSE: | |
print("Feature to select: %d" % (numb_of_features)) | |
print("Maximum minimum distance: %.4f" % dist) | |
print("Features selected: %r" % fs) | |
print() | |
def bhat1(f_id, mua, cva, mub, cvb): | |
mu_a, cv_a = mua[f_id], cva[f_id, f_id] | |
mu_b, cv_b = mub[f_id], cvb[f_id, f_id] | |
mu_diff = mu_a - mu_b | |
cv_sum = cv_a + cv_b | |
return (1. / 8. * mu_diff * 1./(cv_sum * 0.5) * mu_diff + | |
0.5 * log((cv_sum * 0.5)/sqrt(cv_a * cv_b))) | |
def bhatN(f_id, mua, cva, mub, cvb): | |
mu_a, cv_a = mua[f_id], v2m(cva, f_id, f_id) | |
mu_b, cv_b = mub[f_id], v2m(cvb, f_id, f_id) | |
mu_diff = mu_a - mu_b | |
cv_sum = cv_a + cv_b | |
return (np.dot(np.dot(mu_diff, inv(cv_sum * 0.5)), mu_diff)/8. + | |
0.5 * np.log(det(cv_sum * 0.5) / np.sqrt(det(cv_a) * det(cv_b)))) | |
def jm(f_id, mu, cv, strategy, comb=None, bhat=bhatN): | |
comb = comb if comb else combinations(sorted(mu.keys()), 2) | |
return strategy([sqrt(2.*(1-exp(-bhat(f_id, mu[a], cv[a], mu[b], cv[b])))) | |
for a, b in comb]) | |
def backword(fs, mu, cv, strategy, comb=None): | |
comb = comb if comb else combinations(sorted(mu.keys()), 2) | |
change = list(combinations(fs, len(fs) - 1))[::-1] | |
distance = np.array([jm(np.array(f_id), mu, cv, strategy, comb, bhatN) | |
for f_id in change]) | |
idistmax = distance.argmax() | |
return -1 if (idistmax == len(fs) - 1) else idistmax | |
def seq_forward_floating_fs(classes, data, strategy=np.mean, precision=6): | |
"""Sequential Forward Floating Feature Selection with | |
Jeffries-Matusita Distance. | |
""" | |
labels = sorted(set(classes)) | |
nrows, ncols = data.shape | |
mu = fgroup(classes, data, mean) | |
cv = fgroup(classes, data, cov) | |
########################################################################## | |
# STRART | |
########################################################################## | |
# computing JM for single features | |
classes_comb = list(combinations(sorted(mu.keys()), 2)) | |
print('Compute single features distances') | |
dist = np.array([jm(f_id, mu, cv, strategy, classes_comb, bhat1) | |
for f_id in range(ncols)]) | |
idistmax = dist.argmax() | |
res = {1: dict(features=idistmax, distance=dist[idistmax])} | |
verbose(1, dist[idistmax], [idistmax, ]) | |
# computing JM for two features | |
features_comb = np.array(list(combinations(range(ncols), 2))) | |
print('Compute couple features distances') | |
dist = np.array([jm(f_id, mu, cv, strategy, classes_comb, bhatN) | |
for f_id in features_comb]) | |
idistmax = dist.argmax() | |
fs = features_comb[idistmax] | |
res[2] = dict(features=fs, distance=dist[idistmax]) | |
verbose(2, dist[idistmax], fs) | |
# computing JM for N features | |
i = 3 | |
nfeat = ncols - i | |
check = round(sqrt(2), precision) | |
while (i < nfeat): | |
fslist = list(fs) | |
features_comb = np.array([fslist + [j, ] | |
for j in range(ncols) if j not in fs]) | |
dist = np.array([jm(f_id, mu, cv, strategy, classes_comb, bhatN) | |
for f_id in features_comb]) | |
if dist.max() is np.NaN: | |
return res | |
idistmax = dist.argmax() | |
fs = features_comb[idistmax] | |
verbose(i, dist[idistmax], fs) | |
bw = backword(fs, mu, cv, strategy, classes_comb) | |
if bw != -1: | |
fs = np.array([f for e, f in enumerate(fs) if e != bw]) | |
res[i-1] = dict(features=fs, | |
distance=jm(fs, mu, cv, strategy, classes_comb)) | |
else: | |
i += 1 | |
res[i] = dict(features=fs, distance=dist[idistmax]) | |
if check == round(dist[idistmax], precision): | |
return res | |
return res | |
if __name__ == "__main__": | |
import argparse | |
# Define the parser options | |
parser = argparse.ArgumentParser(description='Sequential Forward Floating' | |
'Feature Selection with ' | |
'Jeffries-Matusita Distance') | |
parser.add_argument('data', type=argparse.FileType('r'), | |
help='CSV file with features data') | |
parser.add_argument('-m', '--method', choices=['mean', 'min', 'median'], | |
default='mean', dest='method', | |
help='Feature selection method') | |
parser.add_argument('-d', '--delimiter', type=str, default=' ', | |
dest='delimiter', help='CSV delimiter') | |
parser.add_argument('-s', '--skip-header', type=int, default=1, | |
dest='skip_header', help='Skip header rows') | |
args = parser.parse_args() | |
classes, data = read(args.data, args.delimiter, args.skip_header) | |
seq_forward_floating_fs(classes, data, strategy=getattr(np, args.method)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment