Last active
December 22, 2015 14:49
-
-
Save nullren/6488541 to your computer and use it in GitHub Desktop.
In a textbook, there was an example that I recreated. The first step was to create a random scattering of two populations in such a way that there would be some chance of correlation. The second step was to use two methods of classifying the population to predict the color of other members: least squares (the line separates the red and blue populations); N-n…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simulate two point clouds ("blue" and "red") for a two-class
# classification demo. Each cloud is a mixture of Gaussian blobs, so
# there is at least some structure to separate or correlate.
# (A multivariate-normal sampler such as rmvnorm would also work.)
nCenters <- 2        # number of blob centers per color
nPointsToGen <- 100  # number of points generated per color
nNearNeighbor <- 5   # neighborhood size for the nearest-neighbor step

# One row per center: columns 1:2 hold the (x, y) of a blue center,
# columns 3:4 hold the (x, y) of a red center.
m <- matrix(rnorm(4 * nCenters), nrow = nCenters, ncol = 4)

# For every point to generate, choose (with replacement) the center it
# is scattered around -- independently for blue and for red.
sb <- sample(nrow(m), nPointsToGen, replace = TRUE)
sr <- sample(nrow(m), nPointsToGen, replace = TRUE)

# Row i of `centers` is (blue x, blue y, red x, red y): the means for
# generated point i. drop = FALSE keeps a matrix even for a single row.
centers <- cbind(m[sb, 1:2, drop = FALSE], m[sr, 3:4, drop = FALSE])

# d[i, 1:2] is a blue point and d[i, 3:4] a red point, each drawn with
# sd 0.2 around its chosen center. The means are passed row by row and
# the result is filled row by row, i.e. in the order an explicit
# loop over points and then coordinates would draw them.
d <- matrix(
  rnorm(length(centers), mean = as.vector(t(centers)), sd = 0.2),
  nrow = nPointsToGen, ncol = 4, byrow = TRUE
)
# Stack the blue and red samples from `d` into one long data frame:
#   x, y   coordinates of the point
#   1      constant column, the intercept c of the plane Z = aX + bY + c
#   value  response Z: 0 for blue, 1 for red
#   color  plotting color of the point
# d[, 1:2] are the blue coordinates and d[, 3:4] the red ones. The rows
# are simply appended (all blue first, then all red); combining two
# tables that share every column does not need a join.
nPerColor <- nrow(d)
points <- data.frame(
  x = c(d[, 1], d[, 3]),
  y = c(d[, 2], d[, 4]),
  `1` = rep(1, 2 * nPerColor),
  value = rep(c(0, 1), each = nPerColor),
  color = rep(c("blue", "red"), each = nPerColor),
  check.names = FALSE  # keep the literal column name "1"
)
# Scatter plot of both populations, each point in its own color.
plot(points$x, points$y, col = as.character(points$color))

# Least-squares classifier: blue circles are coded Z = 0 and red Z = 1.
# Fit the plane Z = B1 * x + B2 * y + B3 to all points, then draw the
# line where the fitted plane equals 0.5; it should approximately
# "separate" the red and blue circles.
X <- as.matrix(points[, 1:3])  # columns: x, y, constant 1
Y <- as.matrix(points[, 4])    # 0/1 response as a one-column matrix

# Solve the normal equations (X'X) B = X'Y as a linear system instead
# of forming the explicit inverse of X'X.
B <- solve(crossprod(X), crossprod(X, Y))

# Boundary:  0.5 = B1 * x + B2 * y + B3
#       =>   y = (-B1 / B2) * x + (0.5 - B3) / B2
if (B[2, 1] != 0) {
  abline(a = (.5 - B[3, 1]) / B[2, 1], b = -B[1, 1] / B[2, 1])
} else {
  # No dependence on y: the boundary is the vertical line
  # x = (0.5 - B3) / B1, which slope/intercept form cannot express.
  abline(v = (.5 - B[3, 1]) / B[1, 1])
}
# Nearest-neighbor classifier: shade the background of the current plot
# with small semi-transparent squares showing, for each cell of a
# 0.1-spaced grid over the data range, which color the majority of the
# N nearest sample points has.
ys <- seq(min(points$y), max(points$y), .1)
xs <- seq(min(points$x), max(points$x), .1)
N <- nNearNeighbor

# Work on plain numeric vectors. Iterating over the rows of the data
# frame would push every row through a character conversion because of
# the `color` column.
sampleX <- points$x
sampleY <- points$y
sampleValue <- points$value
nUse <- min(N, length(sampleX))  # never ask for more neighbors than exist

# Every grid cell as one row; y varies fastest within each x.
grid <- expand.grid(y = ys, x = xs)

# For each cell: squared distance to all samples, take the N closest
# (ties broken by sample x, then y) and vote. value is 0 for blue and
# 1 for red, so a mean above 0.5 is a red majority; an exact tie
# counts as blue.
isRed <- vapply(seq_len(nrow(grid)), function(i) {
  sqDist <- (sampleX - grid$x[i])^2 + (sampleY - grid$y[i])^2
  nearest <- order(sqDist, sampleX, sampleY)[seq_len(nUse)]
  mean(sampleValue[nearest]) > .5
}, logical(1))

# Draw all cells in one call. graphics:: is spelled out because a data
# frame named `points` exists alongside the plotting function.
cellColor <- ifelse(
  isRed,
  rgb(255, 0, 0, 50, maxColorValue = 255),
  rgb(0, 0, 255, 50, maxColorValue = 255)
)
graphics::points(grid$x, grid$y, col = cellColor, pch = 15)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment