Last active
December 5, 2016 07:49
-
-
Save diamonaj/287193a4d15d1f5c00dcd9283d179fdb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the random forest library and the Lalonde data.
# NOTE(review): `lalonde` does not ship with randomForest -- it comes from
# the Matching/MatchIt/arm packages; presumably one of them is attached
# earlier in the session. Confirm before running standalone.
library(randomForest)
data(lalonde)

# Keep only the control units (treat == 0); delete the original data
# frame so we cannot accidentally refer to it later.
lalonde2 <- lalonde[which(lalonde$treat == 0), ]
rm(lalonde)

# Drop the treatment indicator (all remaining rows are controls) and the
# post-treatment earnings columns: we are going to predict "u75"
# (unemployed in 1975) from the remaining covariates.
elimin.cols <- which(names(lalonde2) %in% c("treat", "re75", "re78"))
lalonde2 <- lalonde2[, -elimin.cols]
# Make the dependent variable a factor so randomForest performs
# classification rather than regression (it only classifies factors).
# Set the levels explicitly: by default R orders levels by the first
# value appearing in the data, which could silently make "0" the first
# level -- confusing when reading confusion matrices later.
lalonde2$u75 <- factor(lalonde2$u75, levels = c(1, 0))

# Sanity check: data has 260 rows and 9 columns.
dim(lalonde2)

# Set the random seed so the train/test split below is reproducible.
# BUG FIX: in the original, set.seed(1234) was fused onto the end of the
# comment line ("# set the random seedset.seed(1234)"), so the call was
# commented out and no seed was ever set.
set.seed(1234)
# Create a test set of 60 rows; the remaining rows form the training
# set. Derive the index range from nrow() rather than the hard-coded
# 260 so this block survives upstream changes to the data.
test <- sample(seq_len(nrow(lalonde2)), 60)
train <- seq_len(nrow(lalonde2))[-test]
lalonde2.train <- lalonde2[train, ]
lalonde2.test <- lalonde2[test, ]

# Report the training-set class balance for the positive ("1") level.
cat("There are", sum(lalonde2.train$u75 == 1, na.rm = TRUE),
    "unemployed observations in the TRAINING set.")
## Fit a bagged forest: setting mtry to the number of predictors means
## every split considers all features, i.e. bagging rather than a true
## random forest. Compute the count from the data instead of
## hard-coding 8 (all columns except the outcome u75).
n.predictors <- ncol(lalonde2) - 1
bag.lalonde2 <- randomForest(u75 ~ ., data = lalonde2, subset = train,
                             mtry = n.predictors, ntree = 5000,
                             importance = TRUE)

# Training-set performance, including the out-of-bag (OOB) error
# estimate reported by print().
print(bag.lalonde2)

# Test-set performance.
cat("There are", sum(lalonde2.test$u75 == 1, na.rm = TRUE),
    "unemployed observations in the TEST set.")
# CONSISTENCY FIX: use lalonde2.test directly (same rows as
# lalonde2[test, ], which the original re-derived here).
yhat.bag <- predict(bag.lalonde2, newdata = lalonde2.test)
table(yhat.bag, lalonde2.test$u75)

## Check variable importance.
importance(bag.lalonde2)
varImpPlot(bag.lalonde2)
# ---------------------------------------------------------------------
# Next step: re-run as a true random forest by setting the mtry
# parameter lower -- which value of 'mtry' minimizes the training
# OOB error rate?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment