dmarx · August 26, 2018 09:48
diff --git a/staged_classifiers_for_unknown_classes.r b/staged_classifiers_for_unknown_classes.r
 library(MASS)
 library(rpart)

 random_pos_def_matrix = function(n){
 	# https://math.stackexchange.com/a/358092/109246
 	A = matrix(runif(n*n), n)
 	A = 0.5*(A+t(A))
 	A + n*diag(n)
 }

 random_cluster = function(n, mu){
 	m = length(mu)
 	s = random_pos_def_matrix(m)
 	mvrnorm(n, mu, s)
 }

 generate_clusters = function(n_clust, n_dim, n_obs, p=NULL, minv=0, maxv=10){
 	if (is.null(p)){
 		p = rep(1/n_clust, n_clust)
 	}
 	n = ceiling(n_obs*p)
 	
 	d = sum(n) %% n_dim
 	n = n-d
 	
 	outv = matrix(0,n_obs, n_dim+1)
 	i1=0
 	for(i in 1:n_clust){
 		i0 = i1+1
 		i1 = i1 + n[1]
 		mu = sample(minv:maxv, n_dim) 
 		m = random_cluster(n[i], mu) 
 		m = cbind(m, i)
 		outv[i0:i1,] = m 
 	}
 	outv = outv[outv[,n_dim+1]>0,]
 	idx = sample(nrow(outv), nrow(outv))
 	outv = data.frame(outv[idx,])
 	colnames(outv)[n_dim+1] = 'y'
 	outv
 }

 #########################################

 fit_base_models = function(X,y){
 	# assumes unknowns are assigned a class index less than 1 
 	models = list()
 	for(i in 1:max(y)){
 		y_i = y==i 
 		mod_i = glm(y_i ~ ., data=X, family="binomial")
 		models[[i]] = list(mod_i)
 	}
 	models
 }

 predict_base_models = function(models, X){
 	n_mod = length(models)
 	outv = matrix(0, nrow(X), n_mod)
 	for(i in 1:n_mod){
 		mod_i = models[[i]][[1]] 
 		outv[,i] = predict(mod_i, X, type="response") 
 	}
 	outv 
 }

 fit_two_stage_model = function(X,y, unk_class_id=-1){
  y = as.numeric(y)
  y[y==unk_class_id] = -1
  
  # Fit first stage
  models = fit_base_models(X, y) 
  X2 = predict_base_models(models, X)
  
  # Fit second stage
  X3 = data.frame(X2) 
  X3$y = as.factor(y) 
  mod_ensemble = rpart(y ~ ., data=X3, method="class")
  list(s1=models, s2=mod_ensemble)
 }

 predict_two_stage_model = function(model, X){
  X2 = predict_base_models(model$s1, X)
  predict(model$s2, data.frame(X2))
 }

 #####################################################3

 n_clust = 20 
 n_dim = 2
 n_obs = 500 
 n_known = floor(n_clust/2)
 perc_train = .5

 #####################################################

 set.seed(123)
 m = generate_clusters(n_clust, n_dim, n_obs, maxv=50) 

 idx = sample(n_obs, n_obs)
 idx = idx[1:floor(n_obs*perc_train)] 
 m_train = m[ idx,] 
 m_test  = m[-idx,]

 X_train = m_train[,1:n_dim]
 X_test  = m_test[ ,1:n_dim]
 y_train = m_train[,1+n_dim]
 y_test  = m_test[ ,1+n_dim]

 y_train_i = y_train[] 
 y_train_i[y_train>n_known] = -1
 y_test_i = y_test[] 
 y_test_i[y_test>n_known] = -1

 #####################################################

 mod = fit_two_stage_model(X_train,y_train_i)
 Y_score_test = predict_two_stage_model(mod, X_test)

 y_p_test = apply(Y_score_test, 1, which.max) - 1
 y_p_test[y_p_test<1] = -1

 mean(y_p_test == y_test_i) # 0.74
 table(y_p_test, y_test)
 table(y_p_test, y_test_i)

 plot(m[,1:n_dim], col=m[,n_dim+1], main='True clusters')
 plot(X_test, col=apply(Y_score_test, 1, which.max), main='Predicted classes')
	library(MASS)
	library(rpart)

	random_pos_def_matrix = function(n){
	# https://math.stackexchange.com/a/358092/109246
	A = matrix(runif(n*n), n)
	A = 0.5*(A+t(A))
	A + n*diag(n)
	}

	random_cluster = function(n, mu){
	m = length(mu)
	s = random_pos_def_matrix(m)
	mvrnorm(n, mu, s)
	}

	generate_clusters = function(n_clust, n_dim, n_obs, p=NULL, minv=0, maxv=10){
	if (is.null(p)){
	p = rep(1/n_clust, n_clust)
	}
	n = ceiling(n_obs*p)

	d = sum(n) %% n_dim
	n = n-d

	outv = matrix(0,n_obs, n_dim+1)
	i1=0
	for(i in 1:n_clust){
	i0 = i1+1
	i1 = i1 + n[1]
	mu = sample(minv:maxv, n_dim)
	m = random_cluster(n[i], mu)
	m = cbind(m, i)
	outv[i0:i1,] = m
	}
	outv = outv[outv[,n_dim+1]>0,]
	idx = sample(nrow(outv), nrow(outv))
	outv = data.frame(outv[idx,])
	colnames(outv)[n_dim+1] = 'y'
	outv
	}

	#########################################

	fit_base_models = function(X,y){
	# assumes unknowns are assigned a class index less than 1
	models = list()
	for(i in 1:max(y)){
	y_i = y==i
	mod_i = glm(y_i ~ ., data=X, family="binomial")
	models[[i]] = list(mod_i)
	}
	models
	}

	predict_base_models = function(models, X){
	n_mod = length(models)
	outv = matrix(0, nrow(X), n_mod)
	for(i in 1:n_mod){
	mod_i = models[[i]][[1]]
	outv[,i] = predict(mod_i, X, type="response")
	}
	outv
	}

	fit_two_stage_model = function(X,y, unk_class_id=-1){
	y = as.numeric(y)
	y[y==unk_class_id] = -1

	# Fit first stage
	models = fit_base_models(X, y)
	X2 = predict_base_models(models, X)

	# Fit second stage
	X3 = data.frame(X2)
	X3$y = as.factor(y)
	mod_ensemble = rpart(y ~ ., data=X3, method="class")
	list(s1=models, s2=mod_ensemble)
	}

	predict_two_stage_model = function(model, X){
	X2 = predict_base_models(model$s1, X)
	predict(model$s2, data.frame(X2))
	}

	#####################################################3

	n_clust = 20
	n_dim = 2
	n_obs = 500
	n_known = floor(n_clust/2)
	perc_train = .5

	#####################################################

	set.seed(123)
	m = generate_clusters(n_clust, n_dim, n_obs, maxv=50)

	idx = sample(n_obs, n_obs)
	idx = idx[1:floor(n_obs*perc_train)]
	m_train = m[ idx,]
	m_test = m[-idx,]

	X_train = m_train[,1:n_dim]
	X_test = m_test[ ,1:n_dim]
	y_train = m_train[,1+n_dim]
	y_test = m_test[ ,1+n_dim]

	y_train_i = y_train[]
	y_train_i[y_train>n_known] = -1
	y_test_i = y_test[]
	y_test_i[y_test>n_known] = -1

	#####################################################

	mod = fit_two_stage_model(X_train,y_train_i)
	Y_score_test = predict_two_stage_model(mod, X_test)

	y_p_test = apply(Y_score_test, 1, which.max) - 1
	y_p_test[y_p_test<1] = -1

	mean(y_p_test == y_test_i) # 0.74
	table(y_p_test, y_test)
	table(y_p_test, y_test_i)

	plot(m[,1:n_dim], col=m[,n_dim+1], main='True clusters')
	plot(X_test, col=apply(Y_score_test, 1, which.max), main='Predicted classes')
No results found