Created
March 7, 2017 23:29
-
-
Save Erotemic/c9532be23e21a44a63d0cc77ed2e65fd to your computer and use it in GitHub Desktop.
toy nan rf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import utool as ut | |
def get_toydata(rng):
    """
    Build the toy dataset selected on the command line.

    Args:
        rng: random state forwarded unchanged to the chosen generator.

    Returns:
        tuple: ``(X_true, X, y)`` as produced by ``toydata2`` when the
        ``--toy2`` flag is present, otherwise by ``toydata1``.
    """
    # The flag is checked first; only the selected generator is looked up.
    make_data = toydata2 if ut.get_argflag('--toy2') else toydata1
    X_true, X, y = make_data(rng)
    return X_true, X, y
def toydata2(rng):
    """
    Build a two-class, two-feature toy dataset and knock out some values.

    The base points come from ``make_classification``. With the ``--extra``
    command-line flag, 100 additional class-0 points are appended around
    (10, -12) and their second feature is always replaced by nan, so that
    the missingness itself is informative. In addition, every entry is
    replaced by nan with probability ``--nanrate`` (default 0.01); a
    nanrate of 0 disables the random dropping.

    Args:
        rng (numpy.random.RandomState): source of all randomness. It is
            passed to ``make_classification`` as ``random_state`` and then
            drawn from directly.

    Returns:
        tuple: ``(X_true, X, y)`` where ``X_true`` holds the complete
        features, ``X`` is a copy of it with nans inserted, and ``y`` holds
        the class labels.
    """
    # Import from the public package: the ``samples_generator`` submodule
    # path is private and is not available in newer scikit-learn releases.
    from sklearn.datasets import make_classification

    n_samples = 1000
    n_features = 2
    n_classes = 2
    n_informative = 2
    n_clusters_per_class = int((2 ** n_informative) // n_classes)
    samplekw = dict(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        weights=None,
        flip_y=0.00,
        class_sep=1.0,
        hypercube=False,
        shift=[-10, 10],
        scale=1.0,
        shuffle=True,
        random_state=rng,
    )
    X_true, y = make_classification(**samplekw)

    with_extra = ut.get_argflag('--extra')
    n_informative_nan = 100
    if with_extra:
        # Extra class-0 points whose second feature will be hidden below,
        # making "feature 1 is nan" a very informative signal.
        extra_x = rng.randn(n_informative_nan, 2) / 2 + [[10, -12]]
        X_true = np.vstack((X_true, extra_x))
        y = np.append(y, [0] * n_informative_nan)

    # Randomly drop datapoints
    X = X_true.copy()
    nanrate = ut.get_argval('--nanrate', default=.01)
    if nanrate:
        # TODO: separate modes for informative nan, random nan, and both.
        # One draw per entry, in the same row-major order as the data, then
        # applied with a boolean mask instead of writing through ``ravel()``
        # (which only works while ``ravel`` happens to return a view).
        drop_mask = (rng.rand(X.size) < nanrate).reshape(X.shape)
        X[drop_mask] = np.nan

    # NOTE(review): the source lost its indentation; this step is taken to
    # be independent of ``nanrate`` because the documented command line
    # ``--extra --nanrate=0`` only makes sense that way -- confirm.
    if with_extra:
        # Hide the second feature of every appended point.
        X[-n_informative_nan:, 1] = np.nan
    return X_true, X, y
def toydata1(rng):
    """
    Build ten labeled Gaussian blobs on a grid and nan-out some dimensions.

    **Description of Plot**

    You'll notice that there are 4 plots. This is necessary to visualize a
    grid with nans. Each plot shows points in the 2-dimensional grid with
    corners at (0, 0) and (40, 40). The top left plot has these coordinates
    labeled. The other 3 plots correspond to the top left grid, but in these
    plots at least one of the dimensions has been "nanned". In the top right
    the x-dimension is "nanned". In the bottom left the y-dimension is
    "nanned", and in the bottom right both dimensions are "nanned". Even
    though all plots are drawn as a 2d-surface only the top left plot is
    truly a surface with 2 degrees of freedom. The top right and bottom left
    plots are really lines with 1 degree of freedom, and the bottom right
    plot is actually just a single point with 0 degrees of freedom.

    In this example I create 10 Gaussian blobs where the first 9 have their
    means laid out in a 3x3 grid and the last one has its mean in the
    center, but I gave it a high standard deviation. I'll refer to the high
    std cluster as 9, and label the other clusters at the grid means (to
    agree with the demo code) like this:

    ```
    6   7   8
    3   4   5
    0   1   2
    ```

    Looking at the top left plot you can see clusters 0, 1, 2, 4, 6, and 8.
    The reason the other clusters do not appear in this grid is because I've
    set at least one of their dimensions to be nan. Specifically, cluster 3
    had its y dimension set to nan; clusters 5 and 7 had their x dimension
    set to nan; and cluster 9 had both x and y dimensions set to nan.

    For clusters 3, 5, and 7, I plot "nanned" points as lines along the
    nanned dimension to show that only the non-nan dimensions can be used to
    distinguish these points. I also plot the original position before I
    "nanned" it for visualization purposes, but the learning algorithm never
    sees this. For cluster 9, I only plot the original positions because all
    of this data collapses to a single point [nan, nan].

    Red points are of class 0, and blue points are of class 1. Points in
    each plot represent the training data. The colored background of each
    plot represents the classification surface.

    Args:
        rng (numpy.random.RandomState): passed to every ``make_blobs`` call
            as ``random_state``, so the blobs are drawn from it in order.

    Returns:
        tuple: ``(X_true, X, y)`` where ``X_true`` stacks the ten blobs with
        their original coordinates, ``X`` is the same stack after the nan
        replacement described above, and ``y`` holds one integer class
        label per row.
    """
    # Import from the public package: the ``samples_generator`` submodule
    # path is private and is not available in newer scikit-learn releases.
    from sklearn.datasets import make_blobs

    step = 20
    n_samples = 100

    # (class label, (grid column, grid row), extra make_blobs kwargs),
    # listed in cluster-id order. The order matters because every call
    # consumes random numbers from the shared ``rng``.
    blob_specs = [
        (0, (0, 0), {}),
        (1, (1, 0), {}),
        (0, (2, 0), {}),
        (1, (0, 1), {}),
        (0, (1, 1), {}),
        (0, (2, 1), {}),
        (0, (0, 2), {}),
        (1, (1, 2), {}),
        (0, (2, 2), {}),
        (1, (1, 1), {'cluster_std': 5}),
    ]

    X_blobs = []
    y_blobs = []
    for label, (col, row), extra_kw in blob_specs:
        # make_blobs returns (points, cluster ids); only the points are used.
        points = make_blobs(n_samples=n_samples,
                            centers=[[col * step, row * step]],
                            random_state=rng, **extra_kw)[0]
        X_blobs.append(points)
        # The builtin ``int`` replaces the ``np.int`` alias, which newer
        # NumPy releases no longer define.
        y_blobs.append(np.full(len(points), label, dtype=int))

    # Snapshot the complete data before any value is hidden; vstack copies,
    # so the in-place edits below do not leak into X_true.
    X_true = np.vstack(X_blobs)

    # nanify some values
    X_blobs[3][:, 1] = np.nan
    X_blobs[7][:, 0] = np.nan
    X_blobs[5][:, 0] = np.nan
    X_blobs[-1][:, :] = np.nan

    X = np.vstack(X_blobs)
    y = np.hstack(y_blobs)
    return X_true, X, y
def show_nan_decision_function_2d(X, y, X_true, clf):
    """
    Draw the classifier's class-1 probability surface in four panels.

    The panels show the prediction surface when both features are present,
    when only x is nan, when only y is nan, and when both are nan. Training
    points are drawn in the panel matching their own nan pattern, at their
    true (pre-nan) position. Points with exactly one nan feature are also
    drawn as a line spanning the missing dimension. A shared colorbar is
    added on the right. Everything is drawn into a new pyplot figure; the
    function does not call ``show`` or save anything.

    Args:
        X (ndarray): two-column training features, possibly containing nan.
        y (ndarray): class labels aligned with ``X``; rows labeled 0 and 1
            are drawn.
        X_true (ndarray): the same points as ``X`` without missing values;
            used for the plot extent and the marker positions.
        clf: fitted classifier whose ``predict_proba`` accepts nan inputs
            and returns a column for class 1.
    """
    import matplotlib.gridspec as gridspec
    import matplotlib.pyplot as plt
    import numpy as np
    # Import the class directly: reaching it via ``matplotlib.collections``
    # only works if some other import already loaded that submodule.
    from matplotlib.collections import LineCollection

    print('Drawing')

    # Fine mesh over the true data extent (padded by 1) used as input to
    # the filled contour plots.
    plot_step = 1.0
    x_min, x_max = X_true[:, 0].min() - 1, X_true[:, 0].max() + 1
    y_min, y_max = X_true[:, 1].min() - 1, X_true[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    xxnan = np.full(xx.shape, fill_value=np.nan)
    yynan = np.full(yy.shape, fill_value=np.nan)

    def predict_surface(grid_x, grid_y):
        # Class-1 probability for every mesh node, shaped like the mesh.
        proba = clf.predict_proba(np.c_[grid_x.ravel(), grid_y.ravel()])
        return proba.T[1].reshape(xx.shape)

    Z_nonnan = predict_surface(xx, yy)         # non-nan zone
    Z_xnan = predict_surface(xxnan, yy)        # x is nan
    Z_ynan = predict_surface(xx, yynan)        # y is nan
    Z_fullnan = predict_surface(xxnan, yynan)  # both are nan

    # Partition the training rows by their nan pattern.
    x_isnan = np.isnan(X.T[0])
    y_isnan = np.isnan(X.T[1])
    is_nonnan = np.logical_and(~x_isnan, ~y_isnan)
    is_xnan = np.logical_and(x_isnan, ~y_isnan)
    is_ynan = np.logical_and(~x_isnan, y_isnan)
    is_fullnan = np.logical_and(x_isnan, y_isnan)

    fig = plt.figure()
    cmap = plt.cm.RdYlBu
    norm = plt.Normalize(vmin=0, vmax=1)
    # Give the colorbar the same normalization the contours use.
    sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
    sm.set_array(np.linspace(0, 1))

    color0 = cmap(0)
    print('color0 = %r' % (color0,))
    color1 = cmap(1.0)
    print('color1 = %r' % (color1,))

    def draw_line_segments(pts1, pts2, ax=None, **kwargs):
        # Draw one segment from pts1[i] to pts2[i] for every i.
        if ax is None:
            ax = plt.gca()
        assert len(pts1) == len(pts2), 'unaligned'
        segments = [(xy1, xy2) for xy1, xy2 in zip(pts1, pts2)]
        linewidth = kwargs.pop('lw', kwargs.pop('linewidth', 1.0))
        alpha = kwargs.pop('alpha', 1.0)
        # The width is passed by keyword: newer matplotlib releases accept
        # only ``segments`` positionally.
        line_group = LineCollection(segments, linewidths=linewidth,
                                    alpha=alpha, **kwargs)
        ax.add_collection(line_group)

    def draw_single_nan_lines(ax, flags, nan_dim):
        # For rows missing exactly ``nan_dim``, draw a line across the full
        # extent of that dimension at the row's known coordinate.
        if not np.any(flags):
            return
        nandim_min = np.nanmin(X_true.T[nan_dim])
        nandim_max = np.nanmax(X_true.T[nan_dim])
        num_dim = 1 - nan_dim  # 2d only
        numdim_pts = X[flags].T[num_dim]
        pts1 = np.empty((flags.sum(), 2))
        pts2 = np.empty((flags.sum(), 2))
        pts1[:, nan_dim] = nandim_min
        pts2[:, nan_dim] = nandim_max
        pts1[:, num_dim] = numdim_pts
        pts2[:, num_dim] = numdim_pts
        y_ = y[flags]
        draw_line_segments(pts1[y_ == 0], pts2[y_ == 0], ax=ax,
                           color=color0, linestyle='-', alpha=1.0)
        draw_line_segments(pts1[y_ == 1], pts2[y_ == 1], ax=ax,
                           color=color1, linestyle='-', alpha=1.0)

    def draw_train_points(ax, flags):
        # Markers sit at the true position, colored by class.
        pts = X_true[flags]
        labels = y[flags]
        for label, color in ((0, color0), (1, color1)):
            chosen = labels == label
            ax.plot(pts.T[0][chosen], pts.T[1][chosen], 'o', color=color,
                    markeredgecolor='w')

    # Draw surfaces and support points in different axes. Row/column 8 of
    # the 17x17 grid is left empty as a gutter between panels.
    gs = gridspec.GridSpec(17, 17)
    panels = [
        # (grid cell, surface, row mask, title, nan dimensions)
        (gs[0:8, 0:8], Z_nonnan, is_nonnan, 'non-nan decision surface', ()),
        (gs[0:8, 8:16], Z_xnan, is_xnan, 'x-nan decision surface', (0,)),
        (gs[9:17, 0:8], Z_ynan, is_ynan, 'y-nan decision surface', (1,)),
        (gs[9:17, 8:16], Z_fullnan, is_fullnan, 'full-nan decision surface',
         (0, 1)),
    ]
    for subspec, Z, flags, title, nan_dims in panels:
        ax = fig.add_subplot(subspec)
        ax.contourf(xx, yy, Z, cmap=cmap, norm=norm, alpha=1.0)
        draw_train_points(ax, flags)
        if len(nan_dims) == 1:
            draw_single_nan_lines(ax, flags, nan_dims[0])
        # A nan axis has no meaningful ticks; label it as such instead.
        if 0 in nan_dims:
            ax.set_xticks([])
            ax.set_xlabel('nan')
        if 1 in nan_dims:
            ax.set_yticks([])
            ax.set_ylabel('nan')
        ax.set_title(title)
        ax.set_aspect('equal')

    fig.suptitle('RandomForestClassifier With NaN decision criteria')

    # Colorbar in the last column of a separate 1x16 grid.
    gs = gridspec.GridSpec(1, 16)
    subspec = gs[:, -1:]
    cax = plt.subplot(subspec)
    plt.colorbar(sm, cax=cax)
    cax.set_ylabel('probability class 1')

    # All six layout parameters are set explicitly, so there is no need to
    # copy the figure's SubplotParams (whose attribute set differs between
    # matplotlib versions).
    plt.subplots_adjust(left=.001, right=.9, top=.9, bottom=.05,
                        hspace=1.0, wspace=1.0)
def main():
    r"""
    Fit a random forest on nan-containing toy data and draw its surfaces.

    The data comes from ``get_toydata`` with a ``RandomState`` seeded with
    42; the fitted classifier is handed to
    ``show_nan_decision_function_2d``. Nothing is returned.

    SeeAlso:
        python -m sklearn.ensemble.tests.test_forest test_multioutput

    CommandLine:
        python -m ibeis toy_classify_nans
        python -m ibeis toy_classify_nans --toy1 --save "rf_nan_toy1.jpg" --figsize=10,10
        python -m ibeis toy_classify_nans --toy2 --save "rf_nan_toy2.jpg" --figsize=10,10
        python -m ibeis toy_classify_nans --toy2 --save "rf_nan_toy3.jpg" --figsize=10,10 --extra
        python -m ibeis toy_classify_nans --toy2 --save "rf_nan_toy4.jpg" --figsize=10,10 --extra --nanrate=0
        python -m ibeis toy_classify_nans --toy2 --save "rf_nan_toy5.jpg" --figsize=10,10 --nanrate=0

    Example:
        >>> from ibeis.algo.hots.script_vsone import *  # NOQA
        >>> result = toy_classify_nans()
    """
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.RandomState(42)

    print('Creating test data')
    X_true, X, y = get_toydata(rng)
    assert len(X) == len(y)

    print('Fitting RF on %d points' % (len(X),))
    # Uncalibrated random forest trained directly on the nan-containing X.
    # NOTE(review): ``missing_values`` is presumably provided by a patched
    # scikit-learn build -- confirm the installed version accepts it.
    forest_params = dict(
        n_estimators=64,
        criterion='gini',
        bootstrap=False,
        missing_values=np.nan,
        random_state=42,
    )
    clf = RandomForestClassifier(**forest_params)
    clf.fit(X, y)

    show_nan_decision_function_2d(X, y, X_true, clf)
if __name__ == '__main__':
    # Script entry point: build the figure, then display it.
    #
    # CommandLine:
    #     python -m ibeis.algo.hots.toy_nan_rf --show
    main()
    import matplotlib.pyplot as plt
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment