Last active
December 5, 2016 07:49
-
-
Save diamonaj/287193a4d15d1f5c00dcd9283d179fdb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the random forest library and the Lalonde data.
# NOTE(review): `lalonde` does not ship with randomForest -- it comes from
# the Matching/MatchIt/arm packages; presumably one of them is attached
# earlier in the session. Confirm before running standalone.
library(randomForest)
data(lalonde)

# Keep only the control units (treat == 0); delete the original data
# frame so we cannot accidentally refer to it later.
lalonde2 <- lalonde[which(lalonde$treat == 0), ]
rm(lalonde)

# Drop the treatment indicator (all remaining rows are controls) and the
# post-treatment earnings columns: we are going to predict "u75"
# (unemployed in 1975) from the remaining covariates.
elimin.cols <- which(names(lalonde2) %in% c("treat", "re75", "re78"))
lalonde2 <- lalonde2[, -elimin.cols]
# Make the dependent variable a factor so randomForest performs
# classification rather than regression (it only classifies factors).
# Set the levels explicitly: by default R orders levels by the first
# value appearing in the data, which could silently make "0" the first
# level -- confusing when reading confusion matrices later.
lalonde2$u75 <- factor(lalonde2$u75, levels = c(1, 0))

# Sanity check: data has 260 rows and 9 columns.
dim(lalonde2)

# Set the random seed so the train/test split below is reproducible.
# BUG FIX: in the original, set.seed(1234) was fused onto the end of the
# comment line ("# set the random seedset.seed(1234)"), so the call was
# commented out and no seed was ever set.
set.seed(1234)
# Create a test set of 60 rows; the remaining rows form the training
# set. Derive the index range from nrow() rather than the hard-coded
# 260 so this block survives upstream changes to the data.
test <- sample(seq_len(nrow(lalonde2)), 60)
train <- seq_len(nrow(lalonde2))[-test]
lalonde2.train <- lalonde2[train, ]
lalonde2.test <- lalonde2[test, ]

# Report the training-set class balance for the positive ("1") level.
cat("There are", sum(lalonde2.train$u75 == 1, na.rm = TRUE),
    "unemployed observations in the TRAINING set.")
## Fit a bagged forest: setting mtry to the number of predictors means
## every split considers all features, i.e. bagging rather than a true
## random forest. Compute the count from the data instead of
## hard-coding 8 (all columns except the outcome u75).
n.predictors <- ncol(lalonde2) - 1
bag.lalonde2 <- randomForest(u75 ~ ., data = lalonde2, subset = train,
                             mtry = n.predictors, ntree = 5000,
                             importance = TRUE)

# Training-set performance, including the out-of-bag (OOB) error
# estimate reported by print().
print(bag.lalonde2)

# Test-set performance.
cat("There are", sum(lalonde2.test$u75 == 1, na.rm = TRUE),
    "unemployed observations in the TEST set.")
# CONSISTENCY FIX: use lalonde2.test directly (same rows as
# lalonde2[test, ], which the original re-derived here).
yhat.bag <- predict(bag.lalonde2, newdata = lalonde2.test)
table(yhat.bag, lalonde2.test$u75)

## Check variable importance.
importance(bag.lalonde2)
varImpPlot(bag.lalonde2)
# ---------------------------------------------------------------------
# Next step: re-run as a true random forest by setting the mtry
# parameter lower -- which value of 'mtry' minimizes the training
# OOB error rate?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment