Last active
August 26, 2018 09:48
-
-
Save dmarx/5c957d99e10b34d87161f5bb2cedbed9 to your computer and use it in GitHub Desktop.
Demonstration of using a simple "two stage" strategy to handle classification that supports an "unknown" class that may contain arbitrarily many true subclasses.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(MASS) | |
| library(rpart) | |
| random_pos_def_matrix = function(n){ | |
| # https://math.stackexchange.com/a/358092/109246 | |
| A = matrix(runif(n*n), n) | |
| A = 0.5*(A+t(A)) | |
| A + n*diag(n) | |
| } | |
| random_cluster = function(n, mu){ | |
| m = length(mu) | |
| s = random_pos_def_matrix(m) | |
| mvrnorm(n, mu, s) | |
| } | |
| generate_clusters = function(n_clust, n_dim, n_obs, p=NULL, minv=0, maxv=10){ | |
| if (is.null(p)){ | |
| p = rep(1/n_clust, n_clust) | |
| } | |
| n = ceiling(n_obs*p) | |
| d = sum(n) %% n_dim | |
| n = n-d | |
| outv = matrix(0,n_obs, n_dim+1) | |
| i1=0 | |
| for(i in 1:n_clust){ | |
| i0 = i1+1 | |
| i1 = i1 + n[1] | |
| mu = sample(minv:maxv, n_dim) | |
| m = random_cluster(n[i], mu) | |
| m = cbind(m, i) | |
| outv[i0:i1,] = m | |
| } | |
| outv = outv[outv[,n_dim+1]>0,] | |
| idx = sample(nrow(outv), nrow(outv)) | |
| outv = data.frame(outv[idx,]) | |
| colnames(outv)[n_dim+1] = 'y' | |
| outv | |
| } | |
| ######################################### | |
| fit_base_models = function(X,y){ | |
| # assumes unknowns are assigned a class index less than 1 | |
| models = list() | |
| for(i in 1:max(y)){ | |
| y_i = y==i | |
| mod_i = glm(y_i ~ ., data=X, family="binomial") | |
| models[[i]] = list(mod_i) | |
| } | |
| models | |
| } | |
| predict_base_models = function(models, X){ | |
| n_mod = length(models) | |
| outv = matrix(0, nrow(X), n_mod) | |
| for(i in 1:n_mod){ | |
| mod_i = models[[i]][[1]] | |
| outv[,i] = predict(mod_i, X, type="response") | |
| } | |
| outv | |
| } | |
| fit_two_stage_model = function(X,y, unk_class_id=-1){ | |
| y = as.numeric(y) | |
| y[y==unk_class_id] = -1 | |
| # Fit first stage | |
| models = fit_base_models(X, y) | |
| X2 = predict_base_models(models, X) | |
| # Fit second stage | |
| X3 = data.frame(X2) | |
| X3$y = as.factor(y) | |
| mod_ensemble = rpart(y ~ ., data=X3, method="class") | |
| list(s1=models, s2=mod_ensemble) | |
| } | |
| predict_two_stage_model = function(model, X){ | |
| X2 = predict_base_models(model$s1, X) | |
| predict(model$s2, data.frame(X2)) | |
| } | |
| #####################################################3 | |
| n_clust = 20 | |
| n_dim = 2 | |
| n_obs = 500 | |
| n_known = floor(n_clust/2) | |
| perc_train = .5 | |
| ##################################################### | |
| set.seed(123) | |
| m = generate_clusters(n_clust, n_dim, n_obs, maxv=50) | |
| idx = sample(n_obs, n_obs) | |
| idx = idx[1:floor(n_obs*perc_train)] | |
| m_train = m[ idx,] | |
| m_test = m[-idx,] | |
| X_train = m_train[,1:n_dim] | |
| X_test = m_test[ ,1:n_dim] | |
| y_train = m_train[,1+n_dim] | |
| y_test = m_test[ ,1+n_dim] | |
| y_train_i = y_train[] | |
| y_train_i[y_train>n_known] = -1 | |
| y_test_i = y_test[] | |
| y_test_i[y_test>n_known] = -1 | |
| ##################################################### | |
| mod = fit_two_stage_model(X_train,y_train_i) | |
| Y_score_test = predict_two_stage_model(mod, X_test) | |
| y_p_test = apply(Y_score_test, 1, which.max) - 1 | |
| y_p_test[y_p_test<1] = -1 | |
| mean(y_p_test == y_test_i) # 0.74 | |
| table(y_p_test, y_test) | |
| table(y_p_test, y_test_i) | |
| plot(m[,1:n_dim], col=m[,n_dim+1], main='True clusters') | |
| plot(X_test, col=apply(Y_score_test, 1, which.max), main='Predicted classes') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment