Skip to content

Instantly share code, notes, and snippets.

@dmarx
Last active August 26, 2018 09:48
Show Gist options
  • Select an option

  • Save dmarx/5c957d99e10b34d87161f5bb2cedbed9 to your computer and use it in GitHub Desktop.

Select an option

Save dmarx/5c957d99e10b34d87161f5bb2cedbed9 to your computer and use it in GitHub Desktop.
Demonstration of using a simple "two stage" strategy to handle classification that supports an "unknown" class that may contain arbitrarily many true subclasses.
library(MASS)
library(rpart)
random_pos_def_matrix = function(n){
# https://math.stackexchange.com/a/358092/109246
A = matrix(runif(n*n), n)
A = 0.5*(A+t(A))
A + n*diag(n)
}
random_cluster = function(n, mu){
m = length(mu)
s = random_pos_def_matrix(m)
mvrnorm(n, mu, s)
}
generate_clusters = function(n_clust, n_dim, n_obs, p=NULL, minv=0, maxv=10){
if (is.null(p)){
p = rep(1/n_clust, n_clust)
}
n = ceiling(n_obs*p)
d = sum(n) %% n_dim
n = n-d
outv = matrix(0,n_obs, n_dim+1)
i1=0
for(i in 1:n_clust){
i0 = i1+1
i1 = i1 + n[1]
mu = sample(minv:maxv, n_dim)
m = random_cluster(n[i], mu)
m = cbind(m, i)
outv[i0:i1,] = m
}
outv = outv[outv[,n_dim+1]>0,]
idx = sample(nrow(outv), nrow(outv))
outv = data.frame(outv[idx,])
colnames(outv)[n_dim+1] = 'y'
outv
}
#########################################
fit_base_models = function(X,y){
# assumes unknowns are assigned a class index less than 1
models = list()
for(i in 1:max(y)){
y_i = y==i
mod_i = glm(y_i ~ ., data=X, family="binomial")
models[[i]] = list(mod_i)
}
models
}
predict_base_models = function(models, X){
n_mod = length(models)
outv = matrix(0, nrow(X), n_mod)
for(i in 1:n_mod){
mod_i = models[[i]][[1]]
outv[,i] = predict(mod_i, X, type="response")
}
outv
}
fit_two_stage_model = function(X,y, unk_class_id=-1){
y = as.numeric(y)
y[y==unk_class_id] = -1
# Fit first stage
models = fit_base_models(X, y)
X2 = predict_base_models(models, X)
# Fit second stage
X3 = data.frame(X2)
X3$y = as.factor(y)
mod_ensemble = rpart(y ~ ., data=X3, method="class")
list(s1=models, s2=mod_ensemble)
}
predict_two_stage_model = function(model, X){
X2 = predict_base_models(model$s1, X)
predict(model$s2, data.frame(X2))
}
#####################################################3
n_clust = 20
n_dim = 2
n_obs = 500
n_known = floor(n_clust/2)
perc_train = .5
#####################################################
set.seed(123)
m = generate_clusters(n_clust, n_dim, n_obs, maxv=50)
idx = sample(n_obs, n_obs)
idx = idx[1:floor(n_obs*perc_train)]
m_train = m[ idx,]
m_test = m[-idx,]
X_train = m_train[,1:n_dim]
X_test = m_test[ ,1:n_dim]
y_train = m_train[,1+n_dim]
y_test = m_test[ ,1+n_dim]
y_train_i = y_train[]
y_train_i[y_train>n_known] = -1
y_test_i = y_test[]
y_test_i[y_test>n_known] = -1
#####################################################
mod = fit_two_stage_model(X_train,y_train_i)
Y_score_test = predict_two_stage_model(mod, X_test)
y_p_test = apply(Y_score_test, 1, which.max) - 1
y_p_test[y_p_test<1] = -1
mean(y_p_test == y_test_i) # 0.74
table(y_p_test, y_test)
table(y_p_test, y_test_i)
plot(m[,1:n_dim], col=m[,n_dim+1], main='True clusters')
plot(X_test, col=apply(Y_score_test, 1, which.max), main='Predicted classes')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment