Created
September 16, 2013 17:46
-
-
Save duckworthd/6584027 to your computer and use it in GitHub Desktop.
Drop-in replacement for `sklearn.cross_validation.KFold` for sequential data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Cross validation selectors | |
@author: duckworthd | |
''' | |
import numpy as np | |
class SequentialFolds(object):
    '''
    Cross-validation iterator for sequential data using a rolling window.

    Samples are ranked by ``times`` and the ranking is cut into
    ``n + ratio`` consecutive chunks of (nearly) equal size. Fold ``i``
    tests on chunk ``i + ratio`` and trains on the ``ratio`` chunks right
    before it, so training samples always precede the test samples.

    For example, with one chunk per month and ratio=2,

        train = [January, February], test = [March]
        train = [February, March],   test = [April]
        train = [March, April],      test = [May]
    '''

    def __init__(self, times, n=5, ratio=3, cumulative=False, indices=True):
        '''
        Parameters
        ----------
        times : array of sortable
            key by which the samples are ordered; only its argsort is kept.
        n : integer
            number of (train, test) splits to generate
        ratio : integer
            number of chunks in the training window, i.e. the training set
            is roughly ``ratio`` times the size of the test set
        cumulative : boolean
            if False, train on exactly the ``ratio`` chunks preceding the
            test chunk. if True, train on every sample that sorts before
            the test chunk.
        indices : boolean
            False results in outputting boolean masks, True in arrays of
            integer indices (in increasing order)
        '''
        # index[k] is the position in `times` of the k-th smallest key.
        self.index = np.argsort(times)
        self.n = n
        self.ratio = ratio
        self.indices = indices
        self.cumulative = cumulative

    def __iter__(self):
        '''
        Yields
        ------
        train : array of boolean or integers
            samples to use for training, as a boolean mask over the
            original sample order or as integer indices into it
        test : array of boolean or integers
            samples to use for testing, in the same form as ``train``
        '''
        n_samples = len(self.index)

        # n test chunks plus `ratio` leading train-only chunks need
        # n + ratio + 1 boundaries. Truncating to int can make chunk sizes
        # differ by one (or be empty when there are few samples).
        split_points = np.linspace(start=0,
                                   stop=n_samples,
                                   num=self.n + self.ratio + 1)
        split_points = split_points.astype(int)

        # Loop-invariant: used to turn masks into sorted integer indices.
        positions = np.arange(n_samples)

        for i in range(self.n):
            train_start = 0 if self.cumulative else split_points[i]
            train_end = split_points[i + self.ratio]
            # The test chunk starts exactly where the training window ends.
            test_end = split_points[i + self.ratio + 1]

            # Builtin `bool` instead of the `np.bool` alias, which was
            # removed from NumPy and raises AttributeError there.
            train = np.zeros(n_samples, dtype=bool)
            train[self.index[train_start:train_end]] = True

            test = np.zeros(n_samples, dtype=bool)
            test[self.index[train_end:test_end]] = True

            if self.indices:
                yield (positions[train], positions[test])
            else:
                yield (train, test)

    def __len__(self):
        '''Number of (train, test) splits produced by iteration.'''
        return self.n
if __name__ == '__main__':
    # Demo of SequentialFolds: shuffle the keys 0..49 so that sample
    # position and sort order differ, then print the (sorted) keys selected
    # for each fold in rolling-window and in cumulative mode.
    keys = np.arange(50)
    # NumPy's own shuffle is the supported way to permute an ndarray in
    # place; the stdlib `random.shuffle` is only safe for the 1-D case.
    np.random.shuffle(keys)

    # Single-argument print(...) calls behave the same under Python 2 and
    # Python 3, unlike the Python 2-only print statement.
    for title, cumulative in (('Without cumulative:', False),
                              ('With cumulative:', True)):
        print(title)
        folds = SequentialFolds(keys, n=5, ratio=2, indices=True,
                                cumulative=cumulative)
        for (train, test) in folds:
            # train/test are integer index arrays, so they can index
            # `keys` directly.
            print('train: ' + str(np.sort(keys[train])))
            print('test: ' + str(np.sort(keys[test])))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment