stephenturner · March 1, 2011 19:31
diff --git a/RF vs lm and FS 1.r b/RF vs lm and FS 1.r
 > #First define the function
 > rsq <- function (yhat, y) 1 - sum((y - yhat)^2)/sum((y - mean(y))^2)
 > 
 > # first, fit a stepwise model and test it on new data
 > totfat1.intonly <- lm(CRC_FAT_TOT~1, data=totfat1)
 > totfat1.full <- lm(CRC_FAT_TOT~., data=totfat1)
 > totfat1.step <- step(totfat1.intonly, scope=list(upper=totfat1.full), trace=0)
 > rsq(predict(totfat1.step,totfat2), totfat2$CRC_FAT_TOT)
 [1] 0.6991431
 > 
 > # Fit and test the RF model
 > set.seed(14)
 > totfat.rf.split12 <- randomForest(
 + x=totfat1[ ,-1], #exclude the first column, the outcome
 + y=totfat1[ , 1], #include only the first column, the outcome
 + xtest=totfat2[ ,-1],
 + ytest=totfat2[ , 1],
 + keep.forest=TRUE,
 + importance=TRUE, 
 + )
 > print(totfat.rf.split12)

 Call:
 randomForest(x = totfat1[, -1], y = totfat1[, 1], xtest = totfat2[,      -1], ytest = totfat2[, 1], importance = TRUE, keep.forest = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
 No. of variables tried at each split: 17

          Mean of squared residuals: 24052766
                    % Var explained: 69.34
                       Test set MSE: 18126592
                    % Var explained: 80.51
 > rsq(predict(totfat.rf.split12, totfat2), totfat2[ ,1])
 [1] 0.799995
 > 
 > # now fit a linear model for total fat with the important variables
 > totfat1.lm <- lm(CRC_FAT_TOT ~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo, data=totfat1)
 > 
 > #how well does that model explain the data?
 > summary(totfat1.lm)

 Call:
 lm(formula = CRC_FAT_TOT ~ CRC_TECH_BMI + ALSR_Leptin + lep_adipo, 
    data = totfat1)

 Residuals:
    Min      1Q  Median      3Q     Max 
 -5565.7 -1381.8   249.6  1535.7  4905.7 

 Coefficients:
             Estimate Std. Error t value Pr(>|t|)
 (Intercept)   -7532.9     3526.6  -2.136   0.0395
 CRC_TECH_BMI   1100.5      165.3   6.658 9.27e-08
 ALSR_Leptin     255.0       54.7   4.662 4.20e-05
 lep_adipo      -128.6      106.3  -1.210   0.2342

 Residual standard error: 2623 on 36 degrees of freedom
 Multiple R-squared: 0.9211,     Adjusted R-squared: 0.9145 
 F-statistic: 140.1 on 3 and 36 DF,  p-value: < 2.2e-16 

 > 
 > #how much variation in new data does the model explain?
 > rsq(predict(totfat1.lm, totfat2), totfat2$CRC_FAT_TOT)
 [1] 0.9011825
 > 
 > # what if i were to restrict the random forest procedure to only allow 
 > # the variables that i found were important modeling the whole dataset?
 > set.seed(42)
 > totfat1.rf.restricted <- randomForest(
 + CRC_FAT_TOT~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo, 
 + data=totfat1, keep.forest=TRUE, importance=TRUE)
 > rsq(predict(totfat1.rf.restricted,totfat2),totfat2$CRC_FAT_TOT)
 [1] 0.8820904
	> #First define the function
	> rsq <- function (yhat, y) 1 - sum((y - yhat)^2)/sum((y - mean(y))^2)
	>
	> # first, fit a stepwise model and test it on new data
	> totfat1.intonly <- lm(CRC_FAT_TOT~1, data=totfat1)
	> totfat1.full <- lm(CRC_FAT_TOT~., data=totfat1)
	> totfat1.step <- step(totfat1.intonly, scope=list(upper=totfat1.full), trace=0)
	> rsq(predict(totfat1.step,totfat2), totfat2$CRC_FAT_TOT)
	[1] 0.6991431
	>
	> # Fit and test the RF model
	> set.seed(14)
	> totfat.rf.split12 <- randomForest(
	+ x=totfat1[ ,-1], #exclude the first column, the outcome
	+ y=totfat1[ , 1], #include only the first column, the outcome
	+ xtest=totfat2[ ,-1],
	+ ytest=totfat2[ , 1],
	+ keep.forest=TRUE,
	+ importance=TRUE,
	+ )
	> print(totfat.rf.split12)

	Call:
	randomForest(x = totfat1[, -1], y = totfat1[, 1], xtest = totfat2[, -1], ytest = totfat2[, 1], importance = TRUE, keep.forest = TRUE)
	Type of random forest: regression
	Number of trees: 500
	No. of variables tried at each split: 17

	Mean of squared residuals: 24052766
	% Var explained: 69.34
	Test set MSE: 18126592
	% Var explained: 80.51
	> rsq(predict(totfat.rf.split12, totfat2), totfat2[ ,1])
	[1] 0.799995
	>
	> # now fit a linear model for total fat with the important variables
	> totfat1.lm <- lm(CRC_FAT_TOT ~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo, data=totfat1)
	>
	> #how well does that model explain the data?
	> summary(totfat1.lm)

	Call:
	lm(formula = CRC_FAT_TOT ~ CRC_TECH_BMI + ALSR_Leptin + lep_adipo,
	data = totfat1)

	Residuals:
	Min 1Q Median 3Q Max
	-5565.7 -1381.8 249.6 1535.7 4905.7

	Coefficients:
	Estimate Std. Error t value Pr(>\|t\|)
	(Intercept) -7532.9 3526.6 -2.136 0.0395
	CRC_TECH_BMI 1100.5 165.3 6.658 9.27e-08
	ALSR_Leptin 255.0 54.7 4.662 4.20e-05
	lep_adipo -128.6 106.3 -1.210 0.2342

	Residual standard error: 2623 on 36 degrees of freedom
	Multiple R-squared: 0.9211, Adjusted R-squared: 0.9145
	F-statistic: 140.1 on 3 and 36 DF, p-value: < 2.2e-16

	>
	> #how much variation in new data does the model explain?
	> rsq(predict(totfat1.lm, totfat2), totfat2$CRC_FAT_TOT)
	[1] 0.9011825
	>
	> # what if i were to restrict the random forest procedure to only allow
	> # the variables that i found were important modeling the whole dataset?
	> set.seed(42)
	> totfat1.rf.restricted <- randomForest(
	+ CRC_FAT_TOT~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo,
	+ data=totfat1, keep.forest=TRUE, importance=TRUE)
	> rsq(predict(totfat1.rf.restricted,totfat2),totfat2$CRC_FAT_TOT)
	[1] 0.8820904