Created
March 1, 2011 19:31
-
-
Save stephenturner/849719 to your computer and use it in GitHub Desktop.
RF vs lm and FS 1.r
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
> #First define the function | |
> rsq <- function (yhat, y) 1 - sum((y - yhat)^2)/sum((y - mean(y))^2) | |
> | |
> # first, fit a stepwise model and test it on new data | |
> totfat1.intonly <- lm(CRC_FAT_TOT~1, data=totfat1) | |
> totfat1.full <- lm(CRC_FAT_TOT~., data=totfat1) | |
> totfat1.step <- step(totfat1.intonly, scope=list(upper=totfat1.full), trace=0) | |
> rsq(predict(totfat1.step,totfat2), totfat2$CRC_FAT_TOT) | |
[1] 0.6991431 | |
> | |
> # Fit and test the RF model | |
> set.seed(14) | |
> totfat.rf.split12 <- randomForest( | |
+ x=totfat1[ ,-1], #exclude the first column, the outcome | |
+ y=totfat1[ , 1], #include only the first column, the outcome | |
+ xtest=totfat2[ ,-1], | |
+ ytest=totfat2[ , 1], | |
+ keep.forest=TRUE, | |
+ importance=TRUE, | |
+ ) | |
> print(totfat.rf.split12) | |
Call: | |
randomForest(x = totfat1[, -1], y = totfat1[, 1], xtest = totfat2[, -1], ytest = totfat2[, 1], importance = TRUE, keep.forest = TRUE) | |
Type of random forest: regression | |
Number of trees: 500 | |
No. of variables tried at each split: 17 | |
Mean of squared residuals: 24052766 | |
% Var explained: 69.34 | |
Test set MSE: 18126592 | |
% Var explained: 80.51 | |
> rsq(predict(totfat.rf.split12, totfat2), totfat2[ ,1]) | |
[1] 0.799995 | |
> | |
> # now fit a linear model for total fat with the important variables | |
> totfat1.lm <- lm(CRC_FAT_TOT ~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo, data=totfat1) | |
> | |
> #how well does that model explain the data? | |
> summary(totfat1.lm) | |
Call: | |
lm(formula = CRC_FAT_TOT ~ CRC_TECH_BMI + ALSR_Leptin + lep_adipo, | |
data = totfat1) | |
Residuals: | |
Min 1Q Median 3Q Max | |
-5565.7 -1381.8 249.6 1535.7 4905.7 | |
Coefficients: | |
Estimate Std. Error t value Pr(>|t|) | |
(Intercept) -7532.9 3526.6 -2.136 0.0395 | |
CRC_TECH_BMI 1100.5 165.3 6.658 9.27e-08 | |
ALSR_Leptin 255.0 54.7 4.662 4.20e-05 | |
lep_adipo -128.6 106.3 -1.210 0.2342 | |
Residual standard error: 2623 on 36 degrees of freedom | |
Multiple R-squared: 0.9211, Adjusted R-squared: 0.9145 | |
F-statistic: 140.1 on 3 and 36 DF, p-value: < 2.2e-16 | |
> | |
> #how much variation in new data does the model explain? | |
> rsq(predict(totfat1.lm, totfat2), totfat2$CRC_FAT_TOT) | |
[1] 0.9011825 | |
> | |
> # what if i were to restrict the random forest procedure to only allow | |
> # the variables that i found were important modeling the whole dataset? | |
> set.seed(42) | |
> totfat1.rf.restricted <- randomForest( | |
+ CRC_FAT_TOT~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo, | |
+ data=totfat1, keep.forest=TRUE, importance=TRUE) | |
> rsq(predict(totfat1.rf.restricted,totfat2),totfat2$CRC_FAT_TOT) | |
[1] 0.8820904 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment