Created
December 3, 2016 14:45
-
-
Save diamonaj/0da5e77fb8435f5c74b23919f6b6e6c3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(randomForest)

# NOTE(review): nothing in this script attaches a package that ships the
# `lalonde` data set (randomForest does not). Presumably it comes from an
# already-attached package such as Matching -- confirm before running.
data(lalonde)

# Create lalonde2 (just the control units); delete the original data to
# avoid mistakes.
lalonde2 <- lalonde[which(lalonde$treat == 0), ]
rm(lalonde)

# Eliminate the treatment indicator variable (they are all control units)
# and remove additional columns -- we are going to predict "u75"
# (unemployed in '75).
elimin.cols <- which(names(lalonde2) %in% c("treat", "re75", "re78"))
# Keep by position via setdiff() rather than `-elimin.cols`: negative
# indexing with an empty vector would silently drop *every* column.
lalonde2 <- lalonde2[, setdiff(seq_along(lalonde2), elimin.cols),
                     drop = FALSE]

# Make the dependent variable (what we are trying to predict) a factor,
# because randomForest performs classification for factors, not for
# 'numeric' variables.
# The levels are set explicitly so that 1 is the first level: by default
# factor() sorts the observed values, which would put 0 first and could be
# confusing when reading the output.
lalonde2$u75 <- factor(lalonde2$u75, levels = c(1, 0))
# Data is expected to have 260 rows and 9 columns.
dim(lalonde2)

# Set the random seed so the random split below is reproducible.
set.seed(1234)

# Create a test set and training set (lalonde2.train and lalonde2.test):
# 60 randomly chosen rows form the test set, all remaining rows the
# training set. Row indices are derived from the data rather than a
# hard-coded 260 so the split stays correct if the row count changes.
row.ids <- seq_len(nrow(lalonde2))
test <- sample(row.ids, 60)
train <- row.ids[-test]
lalonde2.train <- lalonde2[train, ]
lalonde2.test <- lalonde2[test, ]

cat("There are", sum(lalonde2.train$u75 == 1),
    "unemployed observations in the TRAINING set.")
## Create a bagged forest, allowing it to try all features at every split.
## Every column except the response `u75` is a predictor (formula `u75 ~ .`),
## so mtry is the column count minus one (8 for the 9-column data).
bag.lalonde2 <- randomForest(u75 ~ ., data = lalonde2, subset = train,
                             mtry = ncol(lalonde2) - 1, ntree = 5000,
                             importance = TRUE)

# How well does it perform on the training set?
# What is the OOB error estimate?
print(bag.lalonde2)
# How well does this perform on the test set?
cat("There are", sum(lalonde2.test$u75 == 1),
    "unemployed observations in the TEST set.")

# Predict the held-out rows and cross-tabulate predictions (rows) against
# the true labels (columns).
yhat.bag <- predict(bag.lalonde2, newdata = lalonde2[test, ])
table(yhat.bag, lalonde2.test$u75)

## Check the importance of the variables.
importance(bag.lalonde2)
varImpPlot(bag.lalonde2)
# # # # # #
# Now re-run for a random forest by setting the
# mtry parameter lower -- which 'mtry' minimizes the training OOB error rate?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment