duckworthd · September 16, 2013 17:46
diff --git a/cross_validation.py b/cross_validation.py
 '''
 Cross validation selectors
 
 @author: duckworthd
 '''
 
 import numpy as np
 
 class SequentialFolds(object):
   '''
   Cut time-ordered data into training and testing sets with a rolling window.

   The samples are ordered by ``times`` and cut into ``n + ratio`` contiguous
   chunks of (nearly) equal size.  Fold ``i`` tests on chunk ``i + ratio`` and
   trains on the ``ratio`` chunks immediately before it, or on every earlier
   chunk when ``cumulative`` is true.  For example, with monthly chunks and
   ``ratio=2``,

     train = [January, February], test = [March]
     train = [February, March], test = [April]
     train = [March, April], test = [May]
   '''
   def __init__(self, times, n=5, ratio=3, cumulative=False, indices=True):
     '''
     Parameters
     ----------
     times : array of sortable
         index by which data is sorted.
     n : integer
         number of splits (folds) desired
     ratio : integer
         number of consecutive chunks used for training in a non-cumulative
         fold; the test set is always a single chunk, so the training set is
         roughly ``ratio`` times as large as the test set
     cumulative: boolean
         if false, train on exactly ``ratio`` chunks.  if true, use
         all samples appearing before the test samples
     indices : boolean
         False results in outputting a boolean mask, True in
         an array of integer indices
     '''
     # Position k of ``index`` holds the original position of the k-th
     # earliest sample.
     self.index = np.argsort(times)
     self.n = n
     self.ratio = ratio
     self.indices = indices
     self.cumulative = cumulative

   def __iter__(self):
     '''
     Yields
     ------
     train : array of boolean or integers
         samples to use for training in either a boolean mask or
         integer index form
     test : array of boolean or integers
         samples to use for testing in either a boolean mask or
         integer index form
     '''
     # n + ratio chunks need n + ratio + 1 boundaries; truncating to int
     # turns the evenly spaced boundaries into usable slice offsets.
     split_points = np.linspace(start=0,
                                stop=len(self.index),
                                num=self.n + self.ratio + 1).astype(int)

     for i in range(self.n):
       train_start = 0 if self.cumulative else split_points[i]
       # The test chunk begins exactly where the training window ends.
       train_end = split_points[i + self.ratio]
       test_end = split_points[i + self.ratio + 1]
       yield (self._select(train_start, train_end),
              self._select(train_end, test_end))

   def _select(self, start, stop):
     '''
     Pick the samples ranked ``start`` .. ``stop - 1`` in time order.

     Returns a boolean mask over the original sample positions, or the
     ascending integer positions of the selected samples when
     ``self.indices`` is true.
     '''
     # Builtin ``bool`` instead of ``np.bool``: the alias was removed from
     # NumPy, the builtin works on every version.
     mask = np.zeros(len(self.index), dtype=bool)
     mask[self.index[start:stop]] = True
     if self.indices:
       return np.flatnonzero(mask)
     return mask

   def __len__(self):
     '''Number of folds produced by iterating over this object.'''
     return self.n
 
 if __name__ == '__main__':

   # Demo of SequentialFolds: shuffle the keys 0..49, then print the sorted
   # keys that each fold selects for training and testing.
   import random
   keys = np.arange(50)
   random.shuffle(keys)

   # Single-argument print(...) emits the same text on Python 2 and 3.
   print('Without cumulative:')
   folds = SequentialFolds(keys, n=5, ratio=2, indices=True)
   for (train, test) in folds:
     print('train: ' + str(np.sort(keys[train])))
     print('test: ' + str(np.sort(keys[test])))

   print('With cumulative:')
   folds = SequentialFolds(keys, n=5, ratio=2, indices=True, cumulative=True)
   for (train, test) in folds:
     print('train: ' + str(np.sort(keys[train])))
     print('test: ' + str(np.sort(keys[test])))
	'''
	Cross validation selectors

	@author: duckworthd
	'''

	import numpy as np

	class SequentialFolds(object):
	  '''
	  Cut time-ordered data into training and testing sets with a rolling window.

	  The samples are ordered by ``times`` and cut into ``n + ratio`` contiguous
	  chunks of (nearly) equal size.  Fold ``i`` tests on chunk ``i + ratio`` and
	  trains on the ``ratio`` chunks immediately before it, or on every earlier
	  chunk when ``cumulative`` is true.  For example, with monthly chunks and
	  ``ratio=2``,

	    train = [January, February], test = [March]
	    train = [February, March], test = [April]
	    train = [March, April], test = [May]
	  '''
	  def __init__(self, times, n=5, ratio=3, cumulative=False, indices=True):
	    '''
	    Parameters
	    ----------
	    times : array of sortable
	        index by which data is sorted.
	    n : integer
	        number of splits (folds) desired
	    ratio : integer
	        number of consecutive chunks used for training in a non-cumulative
	        fold; the test set is always a single chunk, so the training set is
	        roughly ``ratio`` times as large as the test set
	    cumulative: boolean
	        if false, train on exactly ``ratio`` chunks.  if true, use
	        all samples appearing before the test samples
	    indices : boolean
	        False results in outputting a boolean mask, True in
	        an array of integer indices
	    '''
	    # Position k of ``index`` holds the original position of the k-th
	    # earliest sample.
	    self.index = np.argsort(times)
	    self.n = n
	    self.ratio = ratio
	    self.indices = indices
	    self.cumulative = cumulative

	  def __iter__(self):
	    '''
	    Yields
	    ------
	    train : array of boolean or integers
	        samples to use for training in either a boolean mask or
	        integer index form
	    test : array of boolean or integers
	        samples to use for testing in either a boolean mask or
	        integer index form
	    '''
	    # n + ratio chunks need n + ratio + 1 boundaries; truncating to int
	    # turns the evenly spaced boundaries into usable slice offsets.
	    split_points = np.linspace(start=0,
	                               stop=len(self.index),
	                               num=self.n + self.ratio + 1).astype(int)

	    for i in range(self.n):
	      train_start = 0 if self.cumulative else split_points[i]
	      # The test chunk begins exactly where the training window ends.
	      train_end = split_points[i + self.ratio]
	      test_end = split_points[i + self.ratio + 1]
	      yield (self._select(train_start, train_end),
	             self._select(train_end, test_end))

	  def _select(self, start, stop):
	    '''
	    Pick the samples ranked ``start`` .. ``stop - 1`` in time order.

	    Returns a boolean mask over the original sample positions, or the
	    ascending integer positions of the selected samples when
	    ``self.indices`` is true.
	    '''
	    # Builtin ``bool`` instead of ``np.bool``: the alias was removed from
	    # NumPy, the builtin works on every version.
	    mask = np.zeros(len(self.index), dtype=bool)
	    mask[self.index[start:stop]] = True
	    if self.indices:
	      return np.flatnonzero(mask)
	    return mask

	  def __len__(self):
	    '''Number of folds produced by iterating over this object.'''
	    return self.n

	if __name__ == '__main__':

	  # Demo of SequentialFolds: shuffle the keys 0..49, then print the sorted
	  # keys that each fold selects for training and testing.
	  import random
	  keys = np.arange(50)
	  random.shuffle(keys)

	  # Single-argument print(...) emits the same text on Python 2 and 3.
	  print('Without cumulative:')
	  folds = SequentialFolds(keys, n=5, ratio=2, indices=True)
	  for (train, test) in folds:
	    print('train: ' + str(np.sort(keys[train])))
	    print('test: ' + str(np.sort(keys[test])))

	  print('With cumulative:')
	  folds = SequentialFolds(keys, n=5, ratio=2, indices=True, cumulative=True)
	  for (train, test) in folds:
	    print('train: ' + str(np.sort(keys[train])))
	    print('test: ' + str(np.sort(keys[test])))