Last active: February 1, 2024 10:02
Revisions
fabianp revised this gist
Mar 19, 2015. 1 changed file with 2 additions and 1 deletion.

```diff
@@ -104,7 +104,8 @@ def predict(self, X):
         Predict an ordering on X. For a list of n samples, this method
         returns a list from 0 to n-1 with the relative order of the rows of X.
         The item is given such that items ranked on top have are predicted a
-        higher ordering.
+        higher ordering (i.e. 0 means is the last item and n_samples would
+        be the item ranked on top).

         Parameters
         ----------
```
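The expanded docstring describes argsort semantics: predict returns sample indices ordered from lowest to highest decision value, so the first entry is the bottom-ranked row and the last entry is the top-ranked one. A tiny illustration, with made-up scores standing in for np.dot(X, coef_):

```python
import numpy as np

scores = np.array([0.3, -1.2, 2.5])  # stand-in for np.dot(X, coef_.ravel())
print(np.argsort(scores))            # [1 0 2]: row 1 ranked last, row 2 on top
```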
fabianp revised this gist
Mar 19, 2015. 1 changed file with 2 additions and 0 deletions.

```diff
@@ -103,6 +103,8 @@ def predict(self, X):
         """
         Predict an ordering on X. For a list of n samples, this method
         returns a list from 0 to n-1 with the relative order of the rows of X.
+        The item is given such that items ranked on top have are predicted a
+        higher ordering.

         Parameters
         ----------
```
fabianp revised this gist
Mar 11, 2015. 1 changed file with 3 additions and 0 deletions.

```diff
@@ -96,6 +96,9 @@ def fit(self, X, y):
         super(RankSVM, self).fit(X_trans, y_trans)
         return self

+    def decision_function(self, X):
+        return np.dot(X, self.coef_.ravel())
+
     def predict(self, X):
         """
         Predict an ordering on X. For a list of n samples, this method
```
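The new decision_function exposes the raw projection onto the learned weights; predict is then just an argsort of those scores. A minimal sketch of that composition, using a made-up weight vector in place of a fitted model's coef_:

```python
import numpy as np

coef = np.array([0.5, -0.2])  # stand-in for a fitted model's coef_
X = np.array([[1., 1.], [3., 0.], [0., 2.]])

scores = np.dot(X, coef.ravel())  # what decision_function returns
print(scores)                     # [ 0.3  1.5 -0.4]
print(np.argsort(scores))         # [2 0 1]: row 2 scores lowest, row 1 highest
```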
fabianp revised this gist
Mar 9, 2015. 1 changed file with 8 additions and 2 deletions.

```diff
@@ -1,8 +1,14 @@
 """
 Implementation of pairwise ranking using scikit-learn LinearSVC

 Reference:
     "Large Margin Rank Boundaries for Ordinal Regression", R. Herbrich,
     T. Graepel, K. Obermayer 1999

     "Learning to rank from medical imaging data." Pedregosa, Fabian, et al.,
     Machine Learning in Medical Imaging 2012.

 Authors: Fabian Pedregosa <fabian@fseoane.net>
          Alexandre Gramfort <alexandre.gramfort@inria.fr>
```
fabianp revised this gist
Mar 8, 2015. 1 changed file with 1 addition and 1 deletion.

```diff
@@ -106,7 +106,7 @@ def predict(self, X):
             the rows in X.
         """
         if hasattr(self, 'coef_'):
-            return np.argsort(np.dot(X, self.coef_.T))
+            return np.argsort(np.dot(X, self.coef_.ravel()))
         else:
             raise ValueError("Must call fit() prior to predict()")
```
fabianp revised this gist
Mar 8, 2015. 1 changed file with 40 additions and 163 deletions.

```diff
@@ -13,7 +13,6 @@
 import itertools

 import numpy as np
-from scipy import linalg
 from sklearn import svm, linear_model, cross_validation
@@ -44,51 +43,35 @@ def transform_pairwise(X, y):
         Data as pairs
     y_trans : array, shape (k,)
         Output class labels, where classes have values {-1, +1}
     """
     X_new = []
     y_new = []
     y = np.asarray(y)
     if y.ndim == 1:
         y = np.c_[y, np.ones(y.shape[0])]
     comb = itertools.combinations(range(X.shape[0]), 2)
     for k, (i, j) in enumerate(comb):
         if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
             # skip if same target or different group
             continue
         X_new.append(X[i] - X[j])
         y_new.append(np.sign(y[i, 0] - y[j, 0]))
         # output balanced classes
         if y_new[-1] != (-1) ** k:
             y_new[-1] = - y_new[-1]
             X_new[-1] = - X_new[-1]
     return np.asarray(X_new), np.asarray(y_new).ravel()


 class RankSVM(svm.LinearSVC):
     """Performs pairwise ranking with an underlying LinearSVC model

     Input should be a n-class ranking problem, this object will convert it
     into a two-class classification problem, a setting known as
     `pairwise ranking`.

     See object :ref:`svm.LinearSVC` for a full description of parameters.
     """

     def fit(self, X, y):
         """
@@ -98,21 +81,13 @@ def fit(self, X, y):
         ----------
         X : array, shape (n_samples, n_features)
         y : array, shape (n_samples,) or (n_samples, 2)

         Returns
         -------
         self
         """
         X_trans, y_trans = transform_pairwise(X, y)
         super(RankSVM, self).fit(X_trans, y_trans)
         return self

     def predict(self, X):
@@ -131,147 +106,49 @@ def predict(self, X):
             the rows in X.
         """
         if hasattr(self, 'coef_'):
             return np.argsort(np.dot(X, self.coef_.T))
         else:
             raise ValueError("Must call fit() prior to predict()")

     def score(self, X, y):
         """
         Because we transformed into a pairwise problem, chance level is at 0.5
         """
         X_trans, y_trans = transform_pairwise(X, y)
         return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)


 if __name__ == '__main__':
     # as showcase, we will create some non-linear data
     # and print the performance of ranking vs linear regression
     np.random.seed(1)
     n_samples, n_features = 300, 5
     true_coef = np.random.randn(n_features)
     X = np.random.randn(n_samples, n_features)
     noise = np.random.randn(n_samples) / np.linalg.norm(true_coef)
     y = np.dot(X, true_coef)
     y = np.arctan(y)  # add non-linearities
     y += .1 * noise  # add noise
     Y = np.c_[y, np.mod(np.arange(n_samples), 5)]  # add query fake id
     cv = cross_validation.KFold(n_samples, 5)
     train, test = iter(cv).next()

     # make a simple plot out of it
     import pylab as pl
     pl.scatter(np.dot(X, true_coef), y)
     pl.title('Data to be learned')
     pl.xlabel('<X, coef>')
     pl.ylabel('y')
     pl.show()

     # print the performance of ranking
     rank_svm = RankSVM().fit(X[train], Y[train])
     print 'Performance of ranking ', rank_svm.score(X[test], Y[test])

     # and that of linear regression
     ridge = linear_model.RidgeCV(fit_intercept=True)
     ridge.fit(X[train], y[train])
     X_test_trans, y_test_trans = transform_pairwise(X[test], y[test])
     score = np.mean(np.sign(np.dot(X_test_trans, ridge.coef_)) == y_test_trans)
     print 'Performance of linear regression ', score
```
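If you want to run the post-cleanup class today, here is a hedged usage sketch. It assumes the final gist code is saved as a local module ranking.py with its sklearn.cross_validation import updated for your scikit-learn version; the module name, the KFold replacement, and the noise scaling are this example's choices, not the gist's.

```python
import numpy as np
from sklearn.model_selection import KFold  # modern home of KFold

from ranking import RankSVM  # hypothetical module holding the gist code

rng = np.random.RandomState(1)
X = rng.randn(300, 5)
true_coef = rng.randn(5)
y = np.arctan(X @ true_coef) + 0.1 * rng.randn(300)  # non-linear targets
Y = np.c_[y, np.arange(300) % 5]                     # column 2: fake query ids

train, test = next(KFold(n_splits=5).split(X))
rank_svm = RankSVM().fit(X[train], Y[train])
print('Performance of ranking', rank_svm.score(X[test], Y[test]))
```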
fabianp revised this gist
Mar 7, 2015. 1 changed file with 3 additions and 0 deletions.

```diff
@@ -6,6 +6,9 @@
 Authors: Fabian Pedregosa <fabian@fseoane.net>
          Alexandre Gramfort <alexandre.gramfort@inria.fr>

+See also https://github.com/fabianp/pysofia for a more efficient
+implementation of RankSVM using stochastic gradient descent methdos.
 """

 import itertools
```
Fabian Pedregosa revised this gist
May 24, 2012. 1 changed file with 11 additions and 40 deletions.

```diff
@@ -78,15 +78,16 @@ class RankSVM(svm.SVC):
     See object :ref:`svm.SVC` for a full description of parameters.
     """

     def __init__(self, weight_cb=None, C=1.0, degree=3, shrinking=True,
                  probability=False, tol=1e-3, cache_size=200):
         self.weight_cb = weight_cb
         super(RankSVM, self).__init__(kernel='linear', degree=degree,
             tol=tol, C=C, shrinking=shrinking, probability=probability,
             cache_size=cache_size)

     def fit(self, X, y):
         """
         Fit a pairwise ranking model.
@@ -103,8 +104,10 @@ def fit(self, X, y, sample_weight=True):
         self
         """
         X_trans, y_trans, diff = transform_pairwise(X, y)
         if self.weight_cb is not None:
             assert hasattr(self.weight_cb, '__call__')
             diff = self.weight_cb(diff)
             super(RankSVM, self).fit(X_trans, y_trans, sample_weight=diff)
         else:
             super(RankSVM, self).fit(X_trans, y_trans)
         return self
@@ -191,6 +194,7 @@ def score(self, X, y):

 def linear_rank(X, y, alphas):
     """
     Fit athe "value-regularized" linear method described in the paper
     "On the consistency of Ranking Algorithms", Duchi et al.
@@ -226,10 +230,6 @@ def test_linear_rank():

 class LinearRankCV(linear_model.base.LinearModel):

     def __init__(self, alphas):
         self.alphas = alphas
         self.fit_intercept = True
@@ -271,33 +271,4 @@ def transform(self, X):
     # as showcase, we will create some non-linear data
     # and print the performance of ranking vs linear regression
     pass
```
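The new weight_cb hook maps each pair's target difference to a per-pair sample weight for the SVM loss. A minimal sketch of a callback, assuming this revision's constructor signature; the 20 * |diff| scaling mirrors the previously hard-coded behaviour and is illustrative, not prescribed:

```python
import numpy as np

def emphasize_large_gaps(diff):
    # weight each pair by how far apart its targets are (factor is arbitrary)
    return 20 * np.abs(diff)

# usage, assuming this revision's RankSVM is in scope:
# model = RankSVM(weight_cb=emphasize_large_gaps).fit(X, y)
```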
Fabian Pedregosa revised this gist
Apr 18, 2012. 1 changed file with 0 additions and 1 deletion.

```diff
@@ -191,7 +191,6 @@ def score(self, X, y):

 def linear_rank(X, y, alphas):
     """
     Fit athe "value-regularized" linear method described in the paper
     "On the consistency of Ranking Algorithms", Duchi et al.
```
Fabian Pedregosa revised this gist
Apr 18, 2012. 1 changed file with 4 additions and 0 deletions.

```diff
@@ -227,6 +227,10 @@ def test_linear_rank():

 class LinearRankCV(linear_model.base.LinearModel):
+    """
+    value-regularized linear method described in the paper
+    "On the consistency of Ranking Algorithms", Duchi et al.
+    """

     def __init__(self, alphas):
         self.alphas = alphas
         self.fit_intercept = True
```
Fabian Pedregosa revised this gist
Apr 18, 2012. 1 changed file with 30 additions and 30 deletions.

```diff
@@ -268,33 +268,33 @@ def transform(self, X):
     # as showcase, we will create some non-linear data
     # and print the performance of ranking vs linear regression
     np.random.seed(0)
     n_samples, n_features = 300, 5
     true_coef = np.random.randn(n_features)
     X = np.random.randn(n_samples, n_features)
     noise = np.random.randn(n_samples) / np.linalg.norm(true_coef)
     y = np.dot(X, true_coef)
     y = np.sqrt(y - np.min(y))  # add non-linearities
     y += .1 * noise  # add noise
     Y = np.c_[y, np.mod(np.arange(n_samples), 5)]  # add query fake id
     cv = cross_validation.KFold(n_samples, 5)
     train, test = iter(cv).next()

     # make a simple plot out of it
     import pylab as pl
     pl.scatter(np.dot(X, true_coef), y)
     pl.title('Data to be learned')
     pl.xlabel('<X, coef>')
     pl.ylabel('y')
     pl.show()

     # print the performance of ranking
     rank_svm = RankSVM().fit(X[train], Y[train])
     print 'Performance of ranking ', rank_svm.score(X[test], Y[test])

     # and that of linear regression
     ridge = linear_model.RidgeCV(fit_intercept=False)
     ridge.fit(X[train], y[train])
     X_test_trans, y_test_trans, diff = transform_pairwise(X[test], y[test])
     score = np.mean(np.sign(ridge.predict(X_test_trans)) == y_test_trans)
     print 'Performance of linear regression ', score
```
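The demo scores the Ridge baseline on pairs: the regressor counts as correct on a pair when the sign of its predicted difference matches the sign of the true target difference, so chance level is 0.5. A self-contained, modernized sketch of that evaluation (Python 3, no query groups; all names are local to this example):

```python
import itertools

import numpy as np
from sklearn.linear_model import RidgeCV

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = X @ rng.randn(5) + 0.1 * rng.randn(100)

ridge = RidgeCV(fit_intercept=False).fit(X, y)
pairs = [(i, j) for i, j in itertools.combinations(range(100), 2)
         if y[i] != y[j]]
correct = [np.sign(ridge.coef_ @ (X[i] - X[j])) == np.sign(y[i] - y[j])
           for i, j in pairs]
print('pairwise accuracy:', np.mean(correct))  # chance level would be 0.5
```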
Fabian Pedregosa revised this gist
Apr 18, 2012. 1 changed file with 189 additions and 40 deletions.

```diff
@@ -10,6 +10,7 @@
 import itertools

 import numpy as np
+from scipy import linalg
 from sklearn import svm, linear_model, cross_validation
@@ -40,51 +41,72 @@ def transform_pairwise(X, y):
         Data as pairs
     y_trans : array, shape (k,)
         Output class labels, where classes have values {-1, +1}
     diff: array
         target difference for each considered samples
     """
     X_new = []
     y_new = []
     diff = []
     y = np.asarray(y)
     if y.ndim == 1:
         y = np.c_[y, np.ones(y.shape[0])]
     comb = itertools.combinations(range(X.shape[0]), 2)
     k = 0
     for (i, j) in comb:
         #if np.abs(y[i, 0] - y[j, 0]) <= 1. or y[i, 1] != y[j, 1]:
         if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
             # skip if same target or different group
             continue
         X_new.append(X[i] - X[j])
         diff.append(y[i, 0] - y[j, 0])
         y_new.append(np.sign(diff[-1]))
         # output balanced classes
         if y_new[-1] != (-1) ** k:
             y_new[-1] = - y_new[-1]
             X_new[-1] = - X_new[-1]
             diff[-1] = - diff[-1]
         k += 1
     return np.asarray(X_new), np.asarray(y_new).ravel(), np.array(diff).ravel()


 class RankSVM(svm.SVC):
     """Performs pairwise ranking with an underlying LinearSVC model

     Input should be a n-class ranking problem, this object will convert it
     into a two-class classification problem, a setting known as
     `pairwise ranking`.

     See object :ref:`svm.SVC` for a full description of parameters.
     """

     def __init__(self, C=1.0, degree=3, shrinking=True, probability=False,
                  tol=1e-3, cache_size=200, scale_C=True):
         super(RankSVM, self).__init__(kernel='linear', degree=degree,
             tol=tol, C=C, shrinking=shrinking, probability=probability,
             cache_size=cache_size, scale_C=scale_C)

     def fit(self, X, y, sample_weight=True):
         """
         Fit a pairwise ranking model.

         Parameters
         ----------
         X : array, shape (n_samples, n_features)
         y : array, shape (n_samples,) or (n_samples, 2)
         sample_weights : boolean
             wether to consider sample weights in the ranking problem
             (= weighted loss function)

         Returns
         -------
         self
         """
         X_trans, y_trans, diff = transform_pairwise(X, y)
         if sample_weight:
             super(RankSVM, self).fit(X_trans, y_trans,
                                      sample_weight=20 * np.abs(diff))
         else:
             super(RankSVM, self).fit(X_trans, y_trans)
         return self

     def predict(self, X):
@@ -109,43 +131,170 @@ def predict(self, X):

     def score(self, X, y):
         """
         Because we transformed into a balanced pairwise problem, chance
         level is at 0.5
         """
         X_trans, y_trans, diff = transform_pairwise(X, y)
         return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)


 class RankLogistic(linear_model.LogisticRegression):

     def fit(self, X, y, sample_weight=False):
         """
         Fit a pairwise ranking model.

         Parameters
         ----------
         X : array, shape (n_samples, n_features)
         y : array, shape (n_samples,) or (n_samples, 2)
         sample_weights : boolean
             wether to consider sample weights in the ranking problem
             (= weighted loss function)

         Returns
         -------
         self
         """
         X_trans, y_trans, diff = transform_pairwise(X, y)
         if sample_weight:
             super(RankLogistic, self).fit(X_trans, y_trans, sample_weight=diff)
         else:
             super(RankLogistic, self).fit(X_trans, y_trans)
         return self

     def predict(self, X):
         """
         Predict an ordering on X. For a list of n samples, this method
         returns a list from 0 to n-1 with the relative order of the rows of X.

         Parameters
         ----------
         X : array, shape (n_samples, n_features)

         Returns
         -------
         ord : array, shape (n_samples,)
             Returns a list of integers representing the relative order of
             the rows in X.
         """
         if hasattr(self, 'coef_'):
             np.argsort(np.dot(X, self.coef_.T))
         else:
             raise ValueError("Must call fit() prior to predict()")

     def score(self, X, y):
         """
         Because we transformed into a balanced pairwise problem, chance
         level is at 0.5
         """
         X_trans, y_trans, diff = transform_pairwise(X, y)
         return np.mean(super(RankLogistic, self).predict(X_trans) == y_trans)


 def linear_rank(X, y, alphas):
     """
     Fit athe "value-regularized" linear method described in the paper
     "On the consistency of Ranking Algorithms", Duchi et al.

     Parameters
     ----------
     alpha: l2-term regularization
     """
     alphas = np.array(alphas)
     U, s, Vt = linalg.svd(X, full_matrices=False)
     Xp, yp, d = transform_pairwise(X, y)
     #d = np.sign(d) * ((100 * d) ** 2)
     #omega = 1. / ( 1e3 * X.shape[0] ** 2)
     #omega = 1e-4
     v = np.dot(Vt, np.dot(d, Xp))
     return np.dot(v / ((s ** 2) + alphas[:, np.newaxis]), Vt)


 def test_linear_rank():
     X = np.random.randn(5, 5)
     y = np.random.randn(5)
     alphas = [.1, 1., 2.]
     W = linear_rank(X, y, alphas)
     Xp, yp, d = transform_pairwise(X, y)
     # check KKT conditions
     for i in range(3):
         right = np.dot(np.dot(X.T, X) + alphas[i] * np.eye(5), W[i])
         left = np.dot(Xp.T, d)
         assert np.allclose(right, left)
     print 'TESTS OK'


 class LinearRankCV(linear_model.base.LinearModel):

     def __init__(self, alphas):
         self.alphas = alphas
         self.fit_intercept = True

     def fit(self, X, y):
         X, y, X_mean, y_mean, X_std =\
            self._center_data(X, y, self.fit_intercept, False, True)
         cv = cross_validation.KFold(X.shape[0], 5)
         scores = np.zeros(len(self.alphas))
         for train, test in cv:
             Xp, yp, dp = transform_pairwise(X[train], y[train])
             W = linear_rank(X[train], y[train], self.alphas)
             Xt, yt, dt = transform_pairwise(X[test], y[test])
             if not yt.size:
                 continue
             train_score = dp * np.dot(Xp, W.T).T
             assert train_score.shape == (len(self.alphas), Xp.shape[0])
             #print(np.mean(dp * np.dot(Xp, W.T).T > 0, 1))
             scores += np.mean(dt * np.dot(Xt, W.T).T > 0, 1)
         self.best_alpha = self.alphas[np.argmax(scores)]
         self.coef_ = linear_rank(X, y, [self.best_alpha])[0]  # learn on whole dataset
         self._set_intercept(X_mean, y_mean, X_std)
         return self

     def score(self, X, y):
         Xt, yt, dt = transform_pairwise(X, y)
         return np.mean(np.sign(np.dot(Xt, self.coef_.T)) == yt)

     def transform(self, X):
         return np.dot(X, self.coef_)[:, np.newaxis]


 if __name__ == '__main__':
     # as showcase, we will create some non-linear data
     # and print the performance of ranking vs linear regression
     # np.random.seed(0)
     # n_samples, n_features = 300, 5
     # true_coef = np.random.randn(n_features)
     # X = np.random.randn(n_samples, n_features)
     # noise = np.random.randn(n_samples) / np.linalg.norm(true_coef)
     # y = np.dot(X, true_coef)
     # y = np.sqrt(y - np.min(y))  # add non-linearities
     # y += .1 * noise  # add noise
     # Y = np.c_[y, np.mod(np.arange(n_samples), 5)]  # add query fake id
     # cv = cross_validation.KFold(n_samples, 5)
     # train, test = iter(cv).next()
     #
     # # make a simple plot out of it
     # import pylab as pl
     # pl.scatter(np.dot(X, true_coef), y)
     # pl.title('Data to be learned')
     # pl.xlabel('<X, coef>')
     # pl.ylabel('y')
     # pl.show()
     #
     # # print the performance of ranking
     # rank_svm = RankSVM().fit(X[train], Y[train])
     # print 'Performance of ranking ', rank_svm.score(X[test], Y[test])
     #
     # # and that of linear regression
     # ridge = linear_model.RidgeCV(fit_intercept=False)
     # ridge.fit(X[train], y[train])
     # X_test_trans, y_test_trans = transform_pairwise(X[test], y[test])
     # score = np.mean(np.sign(ridge.predict(X_test_trans)) == y_test_trans)
     # print 'Performance of linear regression ', score
```
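linear_rank reuses one SVD of X to solve (X.T @ X + alpha * I) w = Xp.T @ d for every alpha at once, which is the identity test_linear_rank asserts through its KKT check. A small numerical verification of that shortcut against a direct solve; the matrices here are random stand-ins for the design matrix and the pairwise-transformed data:

```python
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(6, 4)    # design matrix
Xp = rng.randn(10, 4)  # stand-in for pairwise-transformed data
d = rng.randn(10)      # stand-in for pairwise target differences
alpha = 0.5

U, s, Vt = np.linalg.svd(X, full_matrices=False)
w_svd = (Vt @ (d @ Xp) / (s ** 2 + alpha)) @ Vt  # linear_rank's route
w_direct = np.linalg.solve(X.T @ X + alpha * np.eye(4), Xp.T @ d)
print(np.allclose(w_svd, w_direct))  # True: both solve the same normal equations
```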
agramfort revised this gist
Mar 18, 2012. 1 changed file with 3 additions and 2 deletions.

```diff
@@ -127,6 +127,7 @@ def score(self, X, y):
     y = np.dot(X, true_coef)
     y = np.sqrt(y - np.min(y))  # add non-linearities
     y += .1 * noise  # add noise
+    Y = np.c_[y, np.mod(np.arange(n_samples), 5)]  # add query fake id
     cv = cross_validation.KFold(n_samples, 5)
     train, test = iter(cv).next()
@@ -139,8 +140,8 @@ def score(self, X, y):
     pl.show()

     # print the performance of ranking
-    rank_svm = RankSVM().fit(X[train], y[train])
-    print 'Performance of ranking ', rank_svm.score(X[test], y[test])
+    rank_svm = RankSVM().fit(X[train], Y[train])
+    print 'Performance of ranking ', rank_svm.score(X[test], Y[test])

     # and that of linear regression
     ridge = linear_model.RidgeCV(fit_intercept=False)
```
agramfort revised this gist
Mar 18, 2012. 1 changed file with 63 additions and 59 deletions.

```diff
@@ -4,7 +4,8 @@
 Reference: "Large Margin Rank Boundaries for Ordinal Regression",
     R. Herbrich, T. Graepel, K. Obermayer.

 Authors: Fabian Pedregosa <fabian@fseoane.net>
          Alexandre Gramfort <alexandre.gramfort@inria.fr>
 """

 import itertools
@@ -13,12 +14,58 @@
 from sklearn import svm, linear_model, cross_validation


 def transform_pairwise(X, y):
     """Transforms data into pairs with balanced labels for ranking

     Transforms a n-class ranking problem into a two-class classification
     problem. Subclasses implementing particular strategies for choosing
     pairs should override this method.

     In this method, all pairs are choosen, except for those that have the
     same target value. The output is an array of balanced classes, i.e.
     there are the same number of -1 as +1

     Parameters
     ----------
     X : array, shape (n_samples, n_features)
         The data
     y : array, shape (n_samples,) or (n_samples, 2)
         Target labels. If it's a 2D array, the second column represents
         the grouping of samples, i.e., samples with different groups will
         not be considered.

     Returns
     -------
     X_trans : array, shape (k, n_feaures)
         Data as pairs
     y_trans : array, shape (k,)
         Output class labels, where classes have values {-1, +1}
     """
     X_new = []
     y_new = []
     y = np.asarray(y)
     if y.ndim == 1:
         y = np.c_[y, np.ones(y.shape[0])]
     comb = itertools.combinations(range(X.shape[0]), 2)
     for k, (i, j) in enumerate(comb):
         if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
             # skip if same target or different group
             continue
         X_new.append(X[i] - X[j])
         y_new.append(np.sign(y[i, 0] - y[j, 0]))
         # output balanced classes
         if y_new[-1] != (-1) ** k:
             y_new[-1] = - y_new[-1]
             X_new[-1] = - X_new[-1]
     return np.asarray(X_new), np.asarray(y_new).ravel()


 class RankSVM(svm.LinearSVC):
     """Performs pairwise ranking with an underlying LinearSVC model

     Input should be a n-class ranking problem, this object will convert it
     into a two-class classification problem, a setting known as
     `pairwise ranking`.

     See object :ref:`svm.LinearSVC` for a full description of parameters.
     """
@@ -36,7 +83,7 @@ def fit(self, X, y):
         -------
         self
         """
         X_trans, y_trans = transform_pairwise(X, y)
         super(RankSVM, self).fit(X_trans, y_trans)
         return self
@@ -64,65 +111,23 @@ def score(self, X, y):
         """
         Because we transformed into a pairwise problem, chance level is at 0.5
         """
         X_trans, y_trans = transform_pairwise(X, y)
         return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)


 if __name__ == '__main__':
     # as showcase, we will create some non-linear data
     # and print the performance of ranking vs linear regression
     np.random.seed(0)
     n_samples, n_features = 300, 5
     true_coef = np.random.randn(n_features)
     X = np.random.randn(n_samples, n_features)
     noise = np.random.randn(n_samples) / np.linalg.norm(true_coef)
     y = np.dot(X, true_coef)
     y = np.sqrt(y - np.min(y))  # add non-linearities
     y += .1 * noise  # add noise
     cv = cross_validation.KFold(n_samples, 5)
     train, test = iter(cv).next()

     # make a simple plot out of it
@@ -134,13 +139,12 @@ def transform_pairwise(self, X, y):
     pl.show()

     # print the performance of ranking
     rank_svm = RankSVM().fit(X[train], y[train])
     print 'Performance of ranking ', rank_svm.score(X[test], y[test])

     # and that of linear regression
     ridge = linear_model.RidgeCV(fit_intercept=False)
     ridge.fit(X[train], y[train])
     X_test_trans, y_test_trans = transform_pairwise(X[test], y[test])
     score = np.mean(np.sign(ridge.predict(X_test_trans)) == y_test_trans)
     print 'Performance of linear regression ', score
```
Fabian Pedregosa revised this gist
Mar 18, 2012. 1 changed file with 13 additions and 5 deletions.

```diff
@@ -30,7 +30,7 @@ def fit(self, X, y):
         Parameters
         ----------
         X : array, shape (n_samples, n_features)
         y : array, shape (n_samples,) or (n_samples, 2)

         Returns
         -------
@@ -80,7 +80,10 @@ def transform_pairwise(self, X, y):
         Parameters
         ----------
         X : array, shape (n_samples, n_features)
         y : array, shape (n_samples,) or (n_samples, 2)
             it it's a 2D array, the second column represents the grouping
             of samples, i.e., samples with different groups will not be
             considered.

         Returns
         -------
@@ -90,13 +93,18 @@ def transform_pairwise(self, X, y):
         """
         X_new = []
         y_new = []
         y = np.asarray(y)
         if y.ndim < 2:
             y = y[:, np.newaxis]
         if y.shape[1] < 2:
             y = np.c_[y, np.ones(y.shape[0])]
         comb = itertools.combinations(range(X.shape[0]), 2)
         for k, (i, j) in enumerate(comb):
             if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
                 # skip if same target or different group
                 continue
             X_new.append(X[i] - X[j])
             y_new.append(np.sign(y[i, 0] - y[j, 0]))
             # output balanced classes
             if y_new[-1] != (-1) ** k:
                 y_new[-1] = - y_new[-1]
```
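This revision firms up the two-column y convention: column one holds the targets, the optional column two holds a query/group id, and pairs are only formed within a group. A hypothetical illustration of assembling such a y:

```python
import numpy as np

targets = np.array([3., 1., 2., 5.])
query_id = np.array([0, 0, 1, 1])  # samples 0-1 and 2-3 are separate queries
y = np.c_[targets, query_id]
# transform_pairwise will pair (0,1) and (2,3) but skip cross-query
# pairs such as (0,2), because their second columns differ
```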
Fabian Pedregosa revised this gist
Mar 12, 2012. 1 changed file with 3 additions and 6 deletions.

```diff
@@ -58,7 +58,7 @@ def predict(self, X):
         if hasattr(self, 'coef_'):
             np.argsort(np.dot(X, self.coef_.T))
         else:
             raise ValueError("Must call fit() prior to predict()")

     def score(self, X, y):
         """
@@ -67,8 +67,6 @@ def score(self, X, y):
         X_trans, y_trans = self.transform_pairwise(X, y)
         return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)

     def transform_pairwise(self, X, y):
         """
         Transforms a n-class ranking problem into a two-class classification
@@ -106,7 +104,6 @@ def transform_pairwise(self, X, y):
         return np.asarray(X_new), np.asarray(y_new).ravel()


 if __name__ == '__main__':
     # as showcase, we will create some non-linear data
     # and print the performance of ranking vs linear regression
@@ -116,7 +113,7 @@ def transform_pairwise(self, X, y):
     true_coef, X = np.random.randn(M), np.random.randn(N, M)
     noise = np.random.randn(X.shape[0]) / np.linalg.norm(true_coef)
     y = np.dot(X, true_coef)
     y = np.sqrt(y - np.min(y)) + .1 * noise  # add noise and non-linearities
     cv = cross_validation.KFold(X.shape[0], 5)
     train, test = iter(cv).next()
@@ -138,4 +135,4 @@ def transform_pairwise(self, X, y):
     ridge.fit(X[train], y[train])
     X_test_trans, y_test_trans = rank_svm.transform_pairwise(X[test], y[test])
     score = np.mean(np.sign(ridge.predict(X_test_trans)) == y_test_trans)
     print 'Performance of linear regression ', score
```
Fabian Pedregosa revised this gist
Mar 12, 2012. No changes.
Fabian Pedregosa revised this gist
Mar 12, 2012. 1 changed file with 38 additions and 38 deletions.

```diff
@@ -36,7 +36,7 @@ def fit(self, X, y):
         -------
         self
         """
         X_trans, y_trans = self.transform_pairwise(X, y)
         super(RankSVM, self).fit(X_trans, y_trans)
         return self
@@ -64,46 +64,46 @@ def score(self, X, y):
         """
         Because we transformed into a pairwise problem, chance level is at 0.5
         """
         X_trans, y_trans = self.transform_pairwise(X, y)
         return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)

     def transform_pairwise(self, X, y):
         """
         Transforms a n-class ranking problem into a two-class classification
         problem. Subclasses implementing particular strategies for choosing
         pairs should override this method.

         In this method, all pairs are choosen, except for those that have
         the same target value. The output is an array of balanced classes,
         i.e. there are the same number of -1 as +1

         Parameters
         ----------
         X : array, shape (n_samples, n_features)
         y : array, shape (n_samples,)

         Returns
         -------
         X_trans : array, shape (k, n_feaures)
         y_trans : array, shape (k,)
             Output class labels, where classes have values {-1, +1}
         """
         X_new = []
         y_new = []
         comb = itertools.combinations(range(X.shape[0]), 2)
         for k, (i, j) in enumerate(comb):
             if y[i] == y[j]:
                 # skip if same target
                 continue
             X_new.append(X[i] - X[j])
             y_new.append(np.sign(y[i] - y[j]))
             # output balanced classes
             if y_new[-1] != (-1) ** k:
                 y_new[-1] = - y_new[-1]
                 X_new[-1] = - X_new[-1]
         return np.asarray(X_new), np.asarray(y_new).ravel()
@@ -136,6 +136,6 @@
     # and that of linear regression
     ridge = linear_model.Ridge(fit_intercept=False)
     ridge.fit(X[train], y[train])
     X_test_trans, y_test_trans = rank_svm.transform_pairwise(X[test], y[test])
     score = np.mean(np.sign(ridge.predict(X_test_trans)) == y_test_trans)
     print 'Performance of linear regression ', score
```
Fabian Pedregosa created this gist
Mar 12, 2012. The initial 141-line file:

```python
"""
Implementation of pairwise ranking using scikit-learn LinearSVC

Reference: "Large Margin Rank Boundaries for Ordinal Regression", R. Herbrich,
    T. Graepel, K. Obermayer.

Author: Fabian Pedregosa <fabian@fseoane.net>
"""

import itertools
import numpy as np

from sklearn import svm, linear_model, cross_validation


class RankSVM(svm.LinearSVC):
    """
    Performs pairwise ranking with an underlying LinearSVC model.

    Input should be a n-class ranking problem, this object will convert it
    into a two-class classification problem, a setting known as
    `pairwise ranking`.

    See object :ref:`svm.LinearSVC` for a full description of parameters.
    """

    def fit(self, X, y):
        """
        Fit a pairwise ranking model.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
        y : array, shape (n_samples,)

        Returns
        -------
        self
        """
        X_trans, y_trans = transform_pairwise(X, y)
        super(RankSVM, self).fit(X_trans, y_trans)
        return self

    def predict(self, X):
        """
        Predict an ordering on X. For a list of n samples, this method
        returns a list from 0 to n-1 with the relative order of the rows of X.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)

        Returns
        -------
        ord : array, shape (n_samples,)
            Returns a list of integers representing the relative order of
            the rows in X.
        """
        if hasattr(self, 'coef_'):
            np.argsort(np.dot(X, self.coef_.T))
        else:
            raise ValueError("Model has no coefficients, must call fit() "
                             "prior to predict()")

    def score(self, X, y):
        """
        Because we transformed into a pairwise problem, chance level is at 0.5
        """
        X_trans, y_trans = transform_pairwise(X, y)
        return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)


def transform_pairwise(X, y):
    """
    Transforms a n-class ranking problem into a two-class classification
    problem. Subclasses implementing particular strategies for choosing
    pairs should override this method.

    In this method, all pairs are choosen, except for those that have the
    same target value. The output is an array of balanced classes, i.e.
    there are the same number of -1 as +1

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
    y : array, shape (n_samples,)

    Returns
    -------
    X_trans : array, shape (k, n_feaures)
    y_trans : array, shape (k,)
        Output class labels, where classes have values {-1, +1}
    """
    X_new = []
    y_new = []
    comb = itertools.combinations(range(X.shape[0]), 2)
    for k, (i, j) in enumerate(comb):
        if y[i] == y[j]:
            # skip if same target
            continue
        X_new.append(X[i] - X[j])
        y_new.append(np.sign(y[i] - y[j]))
        # output balanced classes
        if y_new[-1] != (-1) ** k:
            y_new[-1] = - y_new[-1]
            X_new[-1] = - X_new[-1]
    return np.asarray(X_new), np.asarray(y_new).ravel()


if __name__ == '__main__':
    # as showcase, we will create some non-linear data
    # and print the performance of ranking vs linear regression
    np.random.seed(0)
    M, N = 5, 100
    true_coef, X = np.random.randn(M), np.random.randn(N, M)
    noise = np.random.randn(X.shape[0]) / np.linalg.norm(true_coef)
    y = np.dot(X, true_coef)
    y = np.sqrt(y - np.min(y)) + .1 * noise  # add noise and non-linearities
    cv = cross_validation.KFold(X.shape[0], 5)
    train, test = iter(cv).next()

    # make a simple plot out of it
    import pylab as pl
    pl.scatter(np.dot(X, true_coef), y)
    pl.title('Data to be learned')
    pl.xlabel('<X, coef>')
    pl.ylabel('y')
    pl.show()

    # print the performance of ranking
    from sklearn import cross_validation
    rank_svm = RankSVM().fit(X[train], y[train])
    print 'Performance of ranking ', rank_svm.score(X[test], y[test])

    # and that of linear regression
    ridge = linear_model.Ridge(fit_intercept=False)
    ridge.fit(X[train], y[train])
    X_test_trans, y_test_trans = transform_pairwise(X[test], y[test])
    score = np.mean(np.sign(ridge.predict(X_test_trans)) == y_test_trans)
    print 'Performance of linear regression ', score
```