Shihab-Shahriar · September 8, 2019 12:12 · Shihab-Shahriar · Sep 8, 2019
diff --git a/coarse_vs_fine.py b/coarse_vs_fine.py
 from collections import Counter
 from time import perf_counter
 import numpy as np

 from sklearn.base import ClassifierMixin, clone
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import StratifiedKFold, cross_val_predict
 from sklearn.utils import safe_indexing

 from imblearn.under_sampling import InstanceHardnessThreshold
 from imblearn.utils.deprecation import deprecate_parameter

 class FineGrainedIH(InstanceHardnessThreshold):
    """Instance-hardness under-sampler that parallelises inside the estimator.

    Only ``_validate_estimator`` is overridden here: the sampler's ``n_jobs``
    is forwarded to the wrapped classifier whenever that classifier exposes
    an ``n_jobs`` parameter.
    """

    def _validate_estimator(self):
        """Create ``self.estimator_`` from the ``estimator`` parameter.

        ``None`` selects a 100-tree ``RandomForestClassifier``. Otherwise the
        given classifier is cloned and, for each of ``random_state`` and
        ``n_jobs`` that it actually supports, the sampler's value is applied.

        Raises
        ------
        ValueError
            If ``estimator`` is neither ``None`` nor a ``ClassifierMixin``
            instance exposing ``predict_proba``.
        """
        if self.estimator is None:
            self.estimator_ = RandomForestClassifier(
                n_estimators=100, random_state=self.random_state,
                n_jobs=self.n_jobs)
            return

        if not (isinstance(self.estimator, ClassifierMixin) and
                hasattr(self.estimator, 'predict_proba')):
            raise ValueError('Invalid parameter `estimator`. Got {}.'.format(
                type(self.estimator)))

        self.estimator_ = clone(self.estimator)
        supported = self.estimator_.get_params()
        overrides = {}
        # Not every classifier has these parameters; ``set_params`` raises on
        # unknown names, so only override what the clone really exposes.
        if 'random_state' in supported:
            overrides['random_state'] = self.random_state
        if 'n_jobs' in supported:
            overrides['n_jobs'] = self.n_jobs
        if overrides:
            self.estimator_.set_params(**overrides)



 class CoarseGrainedIH(InstanceHardnessThreshold):
    """Instance-hardness under-sampler that parallelises across CV folds.

    The wrapped classifier is kept single-threaded and ``self.n_jobs`` is
    handed to ``cross_val_predict`` instead, so the parallelism is spent on
    the folds rather than inside each fit.
    """

    def _validate_estimator(self):
        """Create ``self.estimator_``, a single-threaded classifier.

        ``None`` selects a 100-tree ``RandomForestClassifier``. Otherwise the
        given classifier is cloned; ``random_state`` is set from the sampler
        and ``n_jobs`` is forced to 1, each only if the clone supports it.

        Raises
        ------
        ValueError
            If ``estimator`` is neither ``None`` nor a ``ClassifierMixin``
            instance exposing ``predict_proba``.
        """
        if self.estimator is None:
            # n_jobs=1 keeps the default consistent with the cloned-estimator
            # path below: the workers are used by ``cross_val_predict``.
            self.estimator_ = RandomForestClassifier(
                n_estimators=100, random_state=self.random_state,
                n_jobs=1)
            return

        if not (isinstance(self.estimator, ClassifierMixin) and
                hasattr(self.estimator, 'predict_proba')):
            raise ValueError('Invalid parameter `estimator`. Got {}.'.format(
                type(self.estimator)))

        self.estimator_ = clone(self.estimator)
        supported = self.estimator_.get_params()
        overrides = {}
        # ``set_params`` raises on unknown names, so only override parameters
        # the clone really exposes.
        if 'random_state' in supported:
            overrides['random_state'] = self.random_state
        if 'n_jobs' in supported:
            overrides['n_jobs'] = 1
        if overrides:
            self.estimator_.set_params(**overrides)

    def _fit_resample(self, X, y):
        """Under-sample ``X``/``y`` using cross-validated class probabilities.

        For every class listed in ``self.sampling_strategy_`` only the
        samples whose out-of-fold probability for their own class reaches a
        percentile threshold are kept; other classes are kept entirely.
        The selected positions are stored in ``self.sample_indices_``.

        Returns
        -------
        tuple
            ``(X_resampled, y_resampled)``, followed by the selected indices
            when ``self.return_indices`` is truthy.
        """
        if self.return_indices:
            deprecate_parameter(self, '0.4', 'return_indices',
                                'sample_indices_')
        self._validate_estimator()

        target_stats = Counter(y)
        # ``random_state`` only matters when shuffling; newer scikit-learn
        # releases reject it together with ``shuffle=False``.
        skf = StratifiedKFold(n_splits=self.cv, shuffle=False)
        probabilities = cross_val_predict(
            self.estimator_, X, y, cv=skf, n_jobs=self.n_jobs,
            method='predict_proba')

        # Probability columns follow the sorted class labels, so translate
        # each label to its column instead of assuming labels are 0..K-1.
        classes, class_columns = np.unique(y, return_inverse=True)
        probabilities = probabilities[np.arange(len(y)), class_columns]

        selected = [np.empty((0,), dtype=int)]
        for target_class in classes:
            class_positions = np.flatnonzero(y == target_class)
            if target_class in self.sampling_strategy_:
                n_samples = self.sampling_strategy_[target_class]
                class_proba = probabilities[class_positions]
                threshold = np.percentile(
                    class_proba,
                    (1. - (n_samples / target_stats[target_class])) * 100.)
                class_positions = class_positions[class_proba >= threshold]
            selected.append(class_positions)

        idx_under = np.concatenate(selected, axis=0)
        self.sample_indices_ = idx_under

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)

 def avg_time(est, X, y, n_runs=10):
    """Return the mean wall-clock seconds of one ``est.fit_resample(X, y)``.

    Parameters
    ----------
    est : object
        Anything with a ``fit_resample(X, y)`` method.
    X, y
        Passed unchanged to ``fit_resample`` on every repetition.
    n_runs : int, default=10
        Number of repetitions to average over.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError('n_runs must be >= 1, got {}.'.format(n_runs))
    start = perf_counter()
    for _ in range(n_runs):
        est.fit_resample(X, y)
    return (perf_counter() - start) / n_runs


 if __name__ == '__main__':
    # Benchmark script: time both samplers on three small sklearn datasets.
    from sklearn.datasets import load_breast_cancer, load_digits, load_iris

    SEED = 42
    # Both samplers receive the same estimator template and the same n_jobs,
    # so they differ only in their class overrides.
    rf = RandomForestClassifier(n_estimators=1000, random_state=SEED)
    coarse_ih = CoarseGrainedIH(estimator=rf, n_jobs=4, random_state=SEED)
    fine_ih = FineGrainedIH(estimator=rf, n_jobs=4, random_state=SEED)

    benchmarks = (
        ('digits', load_digits),
        ('iris', load_iris),
        ('cancer', load_breast_cancer),
    )
    for name, dataset in benchmarks:
        X, y = dataset(return_X_y=True)
        print(name)
        print("Coarse:", avg_time(coarse_ih, X, y))
        print("Fine:", avg_time(fine_ih, X, y))
	from collections import Counter
	from time import perf_counter
	import numpy as np

	from sklearn.base import ClassifierMixin, clone
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import StratifiedKFold, cross_val_predict
	from sklearn.utils import safe_indexing

	from imblearn.under_sampling import InstanceHardnessThreshold
	from imblearn.utils.deprecation import deprecate_parameter

	class FineGrainedIH(InstanceHardnessThreshold):
	    """Instance-hardness under-sampler that parallelises inside the estimator.

	    Only ``_validate_estimator`` is overridden here: the sampler's ``n_jobs``
	    is forwarded to the wrapped classifier whenever that classifier exposes
	    an ``n_jobs`` parameter.
	    """

	    def _validate_estimator(self):
	        """Create ``self.estimator_`` from the ``estimator`` parameter.

	        ``None`` selects a 100-tree ``RandomForestClassifier``. Otherwise the
	        given classifier is cloned and, for each of ``random_state`` and
	        ``n_jobs`` that it actually supports, the sampler's value is applied.

	        Raises
	        ------
	        ValueError
	            If ``estimator`` is neither ``None`` nor a ``ClassifierMixin``
	            instance exposing ``predict_proba``.
	        """
	        if self.estimator is None:
	            self.estimator_ = RandomForestClassifier(
	                n_estimators=100, random_state=self.random_state,
	                n_jobs=self.n_jobs)
	            return

	        if not (isinstance(self.estimator, ClassifierMixin) and
	                hasattr(self.estimator, 'predict_proba')):
	            raise ValueError('Invalid parameter `estimator`. Got {}.'.format(
	                type(self.estimator)))

	        self.estimator_ = clone(self.estimator)
	        supported = self.estimator_.get_params()
	        overrides = {}
	        # Not every classifier has these parameters; ``set_params`` raises on
	        # unknown names, so only override what the clone really exposes.
	        if 'random_state' in supported:
	            overrides['random_state'] = self.random_state
	        if 'n_jobs' in supported:
	            overrides['n_jobs'] = self.n_jobs
	        if overrides:
	            self.estimator_.set_params(**overrides)



	class CoarseGrainedIH(InstanceHardnessThreshold):
	    """Instance-hardness under-sampler that parallelises across CV folds.

	    The wrapped classifier is kept single-threaded and ``self.n_jobs`` is
	    handed to ``cross_val_predict`` instead, so the parallelism is spent on
	    the folds rather than inside each fit.
	    """

	    def _validate_estimator(self):
	        """Create ``self.estimator_``, a single-threaded classifier.

	        ``None`` selects a 100-tree ``RandomForestClassifier``. Otherwise the
	        given classifier is cloned; ``random_state`` is set from the sampler
	        and ``n_jobs`` is forced to 1, each only if the clone supports it.

	        Raises
	        ------
	        ValueError
	            If ``estimator`` is neither ``None`` nor a ``ClassifierMixin``
	            instance exposing ``predict_proba``.
	        """
	        if self.estimator is None:
	            # n_jobs=1 keeps the default consistent with the cloned-estimator
	            # path below: the workers are used by ``cross_val_predict``.
	            self.estimator_ = RandomForestClassifier(
	                n_estimators=100, random_state=self.random_state,
	                n_jobs=1)
	            return

	        if not (isinstance(self.estimator, ClassifierMixin) and
	                hasattr(self.estimator, 'predict_proba')):
	            raise ValueError('Invalid parameter `estimator`. Got {}.'.format(
	                type(self.estimator)))

	        self.estimator_ = clone(self.estimator)
	        supported = self.estimator_.get_params()
	        overrides = {}
	        # ``set_params`` raises on unknown names, so only override parameters
	        # the clone really exposes.
	        if 'random_state' in supported:
	            overrides['random_state'] = self.random_state
	        if 'n_jobs' in supported:
	            overrides['n_jobs'] = 1
	        if overrides:
	            self.estimator_.set_params(**overrides)

	    def _fit_resample(self, X, y):
	        """Under-sample ``X``/``y`` using cross-validated class probabilities.

	        For every class listed in ``self.sampling_strategy_`` only the
	        samples whose out-of-fold probability for their own class reaches a
	        percentile threshold are kept; other classes are kept entirely.
	        The selected positions are stored in ``self.sample_indices_``.

	        Returns
	        -------
	        tuple
	            ``(X_resampled, y_resampled)``, followed by the selected indices
	            when ``self.return_indices`` is truthy.
	        """
	        if self.return_indices:
	            deprecate_parameter(self, '0.4', 'return_indices',
	                                'sample_indices_')
	        self._validate_estimator()

	        target_stats = Counter(y)
	        # ``random_state`` only matters when shuffling; newer scikit-learn
	        # releases reject it together with ``shuffle=False``.
	        skf = StratifiedKFold(n_splits=self.cv, shuffle=False)
	        probabilities = cross_val_predict(
	            self.estimator_, X, y, cv=skf, n_jobs=self.n_jobs,
	            method='predict_proba')

	        # Probability columns follow the sorted class labels, so translate
	        # each label to its column instead of assuming labels are 0..K-1.
	        classes, class_columns = np.unique(y, return_inverse=True)
	        probabilities = probabilities[np.arange(len(y)), class_columns]

	        selected = [np.empty((0,), dtype=int)]
	        for target_class in classes:
	            class_positions = np.flatnonzero(y == target_class)
	            if target_class in self.sampling_strategy_:
	                n_samples = self.sampling_strategy_[target_class]
	                class_proba = probabilities[class_positions]
	                threshold = np.percentile(
	                    class_proba,
	                    (1. - (n_samples / target_stats[target_class])) * 100.)
	                class_positions = class_positions[class_proba >= threshold]
	            selected.append(class_positions)

	        idx_under = np.concatenate(selected, axis=0)
	        self.sample_indices_ = idx_under

	        if self.return_indices:
	            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
	                    idx_under)
	        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)

	def avg_time(est, X, y, n_runs=10):
	    """Return the mean wall-clock seconds of one ``est.fit_resample(X, y)``.

	    Parameters
	    ----------
	    est : object
	        Anything with a ``fit_resample(X, y)`` method.
	    X, y
	        Passed unchanged to ``fit_resample`` on every repetition.
	    n_runs : int, default=10
	        Number of repetitions to average over.

	    Raises
	    ------
	    ValueError
	        If ``n_runs`` is smaller than 1.
	    """
	    if n_runs < 1:
	        raise ValueError('n_runs must be >= 1, got {}.'.format(n_runs))
	    start = perf_counter()
	    for _ in range(n_runs):
	        est.fit_resample(X, y)
	    return (perf_counter() - start) / n_runs


	if __name__ == '__main__':
	    # Benchmark script: time both samplers on three small sklearn datasets.
	    from sklearn.datasets import load_breast_cancer, load_digits, load_iris

	    SEED = 42
	    # Both samplers receive the same estimator template and the same n_jobs,
	    # so they differ only in their class overrides.
	    rf = RandomForestClassifier(n_estimators=1000, random_state=SEED)
	    coarse_ih = CoarseGrainedIH(estimator=rf, n_jobs=4, random_state=SEED)
	    fine_ih = FineGrainedIH(estimator=rf, n_jobs=4, random_state=SEED)

	    benchmarks = (
	        ('digits', load_digits),
	        ('iris', load_iris),
	        ('cancer', load_breast_cancer),
	    )
	    for name, dataset in benchmarks:
	        X, y = dataset(return_X_y=True)
	        print(name)
	        print("Coarse:", avg_time(coarse_ih, X, y))
	        print("Fine:", avg_time(fine_ih, X, y))