Last active
March 21, 2020 08:25
-
-
Save hxhc/a8d6e8858a6f11a64e249c94bda474e9 to your computer and use it in GitHub Desktop.
Spectral sample-set split methods, including random split, Kennard-Stone split and SPXY split. The max-min distance split, which is the core of both the Kennard-Stone and SPXY splits, is also implemented as a standalone function.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding=utf-8 -*- | |
from __future__ import division, print_function | |
import numpy as np | |
from sklearn.model_selection import train_test_split | |
from scipy.spatial.distance import cdist | |
def random_split(spectra, test_size=0.25, random_state=None, shuffle=True, stratify=None):
    """Randomly split ``spectra`` into train and test subsets.

    Thin wrapper that forwards every argument unchanged to
    sklearn.model_selection.train_test_split. See
    http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    for more information on the parameters and the returned value.
    """
    # Collect the keyword options once so the delegation is a single call.
    split_options = {
        'test_size': test_size,
        'random_state': random_state,
        'shuffle': shuffle,
        'stratify': stratify,
    }
    return train_test_split(spectra, **split_options)
def kennardstone(spectra, test_size=0.25, metric='euclidean', *args, **kwargs):
    """Kennard-Stone sample split method.

    Parameters
    ----------
    spectra : ndarray, shape of i x j
        i spectra and j variables (wavelength/wavenumber/Raman shift and so on)
    test_size : float, int
        if smaller than 1, it is treated as the test fraction, so
        round(i x (1 - test_size)) spectra are selected as train data,
        by default 0.25
        otherwise round(test_size) is used directly as the test data size
    metric : str, optional
        The distance metric to use, by default 'euclidean'
        See scipy.spatial.distance.cdist for more information
    *args, **kwargs
        forwarded unchanged to scipy.spatial.distance.cdist

    Returns
    -------
    select_pts : list
        index of selected spectra as train data, index is zero based
    remaining_pts : list
        index of remaining spectra as test data, index is zero based

    Raises
    ------
    ValueError
        if the resulting train sample size is smaller than 2

    References
    ----------
    Kennard, R. W., & Stone, L. A. (1969). Computer aided design of experiments.
    Technometrics, 11(1), 137-148. (https://www.jstor.org/stable/1266770)
    """
    spectra = np.asarray(spectra)
    n_samples = spectra.shape[0]

    # int(...) keeps train_size an integer on Python 2 as well, where round()
    # returns a float that range() would reject further down.
    if test_size < 1:
        train_size = int(round(n_samples * (1 - test_size)))
    else:
        train_size = n_samples - int(round(test_size))

    # Exactly two train samples (the two farthest points) is a valid request.
    if train_size < 2:
        raise ValueError("train sample size should be at least 2")

    # metric is passed positionally so extra positional arguments do not
    # collide with it ("multiple values for argument 'metric'").
    distance = cdist(spectra, spectra, metric, *args, **kwargs)
    return max_min_distance_split(distance, train_size)
def spxy(spectra, yvalues, test_size=0.25, metric='euclidean', *args, **kwargs):
    """SPXY sample split method.

    The split is driven by the sum of the spectra distance matrix and the
    y-value distance matrix, each scaled by its own maximum.

    Parameters
    ----------
    spectra : ndarray, shape of i x j
        i spectra and j variables (wavelength/wavenumber/Raman shift and so on)
    yvalues : ndarray, i entries along the first axis
        reference values of the i spectra; reshaped to (i, -1) internally
    test_size : float, int
        if smaller than 1, it is treated as the test fraction, so
        round(i x (1 - test_size)) spectra are selected as train data,
        by default 0.25
        otherwise round(test_size) is used directly as the test data size
    metric : str, optional
        The distance metric to use (for both spectra and y values),
        by default 'euclidean'
        See scipy.spatial.distance.cdist for more information
    *args, **kwargs
        forwarded unchanged to scipy.spatial.distance.cdist

    Returns
    -------
    select_pts : list
        index of selected spectra as train data, index is zero based
    remaining_pts : list
        index of remaining spectra as test data, index is zero based

    Raises
    ------
    ValueError
        if the resulting train sample size is smaller than 2

    References
    ----------
    Galvao et al. (2005). A method for calibration and validation subset partitioning.
    Talanta, 67(4), 736-740. (https://www.sciencedirect.com/science/article/pii/S003991400500192X)
    """
    spectra = np.asarray(spectra)
    n_samples = spectra.shape[0]

    # int(...) keeps train_size an integer on Python 2 as well, where round()
    # returns a float that range() would reject further down.
    if test_size < 1:
        train_size = int(round(n_samples * (1 - test_size)))
    else:
        train_size = n_samples - int(round(test_size))

    # Exactly two train samples (the two farthest points) is a valid request.
    if train_size < 2:
        raise ValueError("train sample size should be at least 2")

    yvalues = np.asarray(yvalues)
    yvalues = yvalues.reshape(yvalues.shape[0], -1)

    # metric is passed positionally so extra positional arguments do not
    # collide with it ("multiple values for argument 'metric'").
    distance_spectra = cdist(spectra, spectra, metric, *args, **kwargs)
    distance_y = cdist(yvalues, yvalues, metric, *args, **kwargs)

    # Scale each matrix by its maximum so both contribute on the same scale.
    # An all-zero matrix (identical samples or identical y values) is left
    # as zeros instead of being turned into NaN by a division by zero.
    max_spectra = distance_spectra.max()
    if max_spectra > 0:
        distance_spectra = distance_spectra / max_spectra
    max_y = distance_y.max()
    if max_y > 0:
        distance_y = distance_y / max_y

    distance = distance_spectra + distance_y
    return max_min_distance_split(distance, train_size)
def max_min_distance_split(distance, train_size):
    """Sample set split based on maximum minimum distance, which is the core
    of the Kennard-Stone method.

    The two farthest points are selected first. Then, repeatedly, the
    remaining point whose distance to its nearest already-selected point is
    the largest is moved to the selection, until ``train_size`` points are
    selected.

    Parameters
    ----------
    distance : distance matrix
        semi-positive real symmetric matrix of a certain distance metric
    train_size : train data sample size
        should be at least 2; the two farthest points are always selected,
        even for a smaller value

    Returns
    -------
    select_pts : list
        index of selected spectra as train data, index is zero-based
    remaining_pts : list
        index of remaining spectra as test data, index is zero-based

    Raises
    ------
    ValueError
        if ``train_size`` exceeds the number of samples, or if the matrix
        contains no pair of distinct points with a positive distance
    """
    distance = np.asarray(distance)
    n_samples = distance.shape[0]
    train_size = int(train_size)

    if train_size > n_samples:
        raise ValueError(
            "train sample size (%d) exceeds the number of samples (%d)"
            % (train_size, n_samples))

    # first select the 2 farthest points
    first, second = np.unravel_index(np.argmax(distance), distance.shape)
    first, second = int(first), int(second)
    if first == second:
        # The maximum lies on the diagonal: there is no farthest pair.
        raise ValueError(
            "distance matrix must contain at least two distinct points")

    select_pts = [first, second]
    # remove the first 2 points from the remaining list (kept in ascending order)
    remaining_pts = [idx for idx in range(n_samples)
                     if idx != first and idx != second]

    for _ in range(train_size - 2):
        # distance from every remaining point to its nearest selected point
        nearest = distance[select_pts][:, remaining_pts].min(axis=0)
        # np.argmax returns the first maximum, so ties resolve to the
        # remaining point with the lowest index. Indexing through
        # remaining_pts guarantees the chosen point really attains the
        # max-min distance (a value match elsewhere in the selected rows is
        # not enough).
        position = int(np.argmax(nearest))
        select_pts.append(remaining_pts.pop(position))

    return select_pts, remaining_pts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment