# Bagging and random forests on the Lalonde control units
# Gist by @diamonaj, created December 3, 2016
# the lalonde data ships with the Matching package
library(Matching); library(randomForest)
data(lalonde)
# create lalonde2 (just control units); delete orig data to avoid mistakes
lalonde2 <- lalonde[which(lalonde$treat == 0), ]
rm(lalonde)
# eliminate the treatment indicator variable (they are all control units)
# remove additional columns -- we are going to predict "u75" (unemployed in '75)
elimin.cols <- which(names(lalonde2) == "treat" | names(lalonde2) == "re75" |
                       names(lalonde2) == "re78")
lalonde2 <- lalonde2[, -elimin.cols]
# make the dependent variable (what we are trying to predict)
# a factor, because random forest will perform classification
# for factors, not for 'numeric' variables.
# notice that I set the levels when I define this factor...
# otherwise, by default, R would sort the levels so that "0"
# comes first, even though the first value appearing in the
# dataset is a "1" -- this could be very confusing.
lalonde2$u75 <- factor(lalonde2$u75, levels = c(1, 0))
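# quick sanity check of that point: by default factor() sorts the
# levels, while passing 'levels =' keeps the order we specify
levels(factor(c(1, 0, 1)))                    # "0" "1" -- sorted by default
levels(factor(c(1, 0, 1), levels = c(1, 0)))  # "1" "0" -- the order we set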
# data has 260 rows and 9 columns
dim(lalonde2)
# set the random seed
set.seed(1234)
# create a test set and training set (lalonde2.train and lalonde2.test)
test <- sample(1:nrow(lalonde2), 60)
train <- c(1:260)[-test]
lalonde2.train <- lalonde2[train, ]
lalonde2.test <- lalonde2[test, ]
cat("There are", length(which(lalonde2.train$u75 == 1)),
"unemployed observations in the TRAINING set.")
## create a bagged forest, allowing it to try all 8 features
bag.lalonde2 <- randomForest(u75 ~ ., data = lalonde2, subset = train,
                             mtry = 8, ntree = 5000, importance = TRUE)
# how well does it perform on the training set?
# what is the OOB test error?
print(bag.lalonde2)
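# to pull that final OOB error rate out programmatically (this lookup is
# my addition, not part of the original gist): a classification
# randomForest stores one error-rate row per tree in err.rate
bag.lalonde2$err.rate[bag.lalonde2$ntree, "OOB"]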
# how well does this perform on the test set?
cat("There are", length(which(lalonde2.test$u75 == 1)),
    "unemployed observations in the TEST set.")
# confusion matrix of predictions vs. actual u75 on the held-out test set
yhat.bag <- predict(bag.lalonde2, newdata = lalonde2[test, ])
table(yhat.bag, lalonde2.test$u75)
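# a single test-error number is sometimes handier than the full table;
# this one-liner is my addition, not part of the original gist
mean(yhat.bag != lalonde2.test$u75)  # test-set misclassification rate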
## check the importance of the variables
importance(bag.lalonde2)
varImpPlot(bag.lalonde2)
# # # # # #
# Now re-run for a random forest by setting the
# mtry parameter lower -- which 'mtry' minimizes the training OOB error rate?
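# a minimal sketch of that search (variable names oob.err, m, and rf.m are
# my own, not part of the original gist), reusing 'train' from above:
oob.err <- rep(NA, 8)
for (m in 1:8) {
  set.seed(1234)  # same seed so the runs differ only in mtry
  rf.m <- randomForest(u75 ~ ., data = lalonde2, subset = train,
                       mtry = m, ntree = 5000)
  oob.err[m] <- rf.m$err.rate[rf.m$ntree, "OOB"]
}
oob.err
which.min(oob.err)  # the mtry with the lowest OOB error rate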