Created
February 16, 2011 00:59
-
-
Save stephenturner/828631 to your computer and use it in GitHub Desktop.
2011-02-15 rf adiposity.r
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(randomForest) | |
############################################################################### | |
############################## load functions ################################# | |
############################################################################### | |
# need to document this! | |
rfr2 = function(randomForestModel) { | |
printoutput = capture.output(print(randomForestModel)) | |
varline = grep("explained",printoutput,value=TRUE) | |
if (length(varline)>2) stop("more than two var's explained!") | |
r2out <- NULL | |
for (i in 1:length(varline)) { | |
thisr2=as.numeric(strsplit(varline[i], ":")[[1]][2]) | |
r2out <- c(r2out,thisr2) | |
} | |
#r2out=sapply(r2out, function(n) if(n<0) return(0) else return(n)) | |
if (class(r2out) == "numeric") return(r2out/100) else stop("r2 not numeric, problem") | |
} | |
# Function to plot the importance with the R2 of the model in the title. | |
impplot=function (randomForestModel, maintitle="ethnicgroup") { | |
r2=rfr2(randomForestModel) | |
# single dataset case | |
if (length(r2)==1) { | |
varImpPlot(randomForestModel, pch=16, type=1, | |
main=paste(maintitle, "\nOOB Training R²=" , r2) | |
) | |
} else if (length(r2)==2) { | |
varImpPlot(randomForestModel, pch=16, type=1, | |
main=paste(maintitle, "\nOOB Training R²=" , r2[1],"\nTesting R²=",r2[2]) | |
) | |
} else stop("you have more than two r2s!") | |
} | |
# permute a column in a data.frame | |
permute <- function (df, columnToPermute="column", seed=NULL) { | |
if (!is.null(seed)) set.seed(seed) | |
print(paste("Random seed:",seed)) | |
colindex <- which(names(df)==columnToPermute) | |
permutedcol <- df[ ,colindex][sample(1:nrow(df))] | |
df[colindex] <- permutedcol | |
return(df) | |
} | |
#splitdf splits a data frame into a training and testing set. | |
#returns a list of two data frames: trainset and testset. | |
#you can optionally apply a random seed. | |
splitdf <- function(dataframe, seed=NULL) { | |
if (!is.null(seed)) set.seed(seed) | |
index <- 1:nrow(dataframe) | |
trainindex <- sample(index, trunc(length(index)/2)) | |
trainset <- dataframe[trainindex, ] | |
testset <- dataframe[-trainindex, ] | |
list(trainset=trainset,testset=testset) | |
} | |
#this function utilizes the function above. | |
#you give it a data frame you want to randomize, | |
#and a character vector with column names you want to be sure are | |
#equally distributed among the two different sets. | |
#these columns must be continuous variables. chi2 not yet implemented. | |
splitdf.randomize <- function(dataframe, ttestcolnames=c("cols","to","test")) { | |
d <- dataframe | |
if (!all(ttestcolnames %in% names(d))) stop(paste(ttestcolnames,"not in dataframe")) | |
ps <- NULL | |
while (is.null(ps) | any(ps<.5)) { | |
set1 <- splitdf(d)$trainset | |
set2 <- splitdf(d)$testset | |
ttestcols <- which(names(d) %in% ttestcolnames) | |
ps <- NULL | |
for (col in ttestcols) { | |
p <- t.test(set1[ ,col], set2[ ,col])$p.value | |
ps=c(ps,p) | |
} | |
print(paste(ttestcolnames," t-test p-value =",ps)) | |
cat("\n") | |
} | |
list(set1=set1,set2=set2) | |
} | |
############################################################################### | |
################################### load data ################################# | |
############################################################################### | |
# Read in the raw data | |
combined=read.csv("C:/Users/turnersd/Documents/Dropbox/Docs/Work/2011-02-14 Obesity project/pilotkeyvars_missingNA.csv") | |
# Ethnicity is coded 1=white, 2=japanese. | |
# This line recodes that numeric variable into a factor variable. | |
combined$ethnicg2 <- factor(combined$ethnicg2, labels=c("White","Japanese")) | |
# Make new datasets. "totfat" contains the variable CRC_FAT_TOT (DXA total | |
# fat) in the first column, then every other clinical / biomarker after | |
# that. "trperi" contains the variable trunk_peri (trunk to peripheral fat | |
# ratio), then every other column after that. | |
matrix(names(combined)) | |
totfat = combined[ ,c(7,2,3,5,6,15:63)] | |
trperi = combined[ ,c(8,2,3,5,6,15:63)] | |
#cbind(matrix(names(totfat)),matrix(names(trperi))) | |
# Do a rough imputation. This sets the NAs to the median (i.e. mimp.totfat) | |
mimp.totfat <- na.roughfix(totfat) | |
mimp.trperi <- na.roughfix(trperi) | |
# impute using the random forest proximity measures | |
set.seed(42) | |
rimp.totfat <- rfImpute(CRC_FAT_TOT~., data=totfat) | |
rimp.trperi <- rfImpute(trunk_peri~., data=trperi) | |
# the methylation profiles were only taken on the 44 women who had MRIs. | |
# worth running again using only the blood markers. | |
#appending the b to the dataset names - blood markers only (no mp_* markers) | |
totfatb=totfat[-(24:46)] | |
trperib=trperi[-(24:46)] | |
#cbind(names(totfatb),names(trperib)) | |
# imputation with blood markers only | |
set.seed(42) | |
rimp.totfatb <- rfImpute(CRC_FAT_TOT~., data=totfatb) | |
rimp.trperib <- rfImpute(trunk_peri~., data=trperib) | |
#rimp.totfatb dataset contains total fat as the dependent variable, and all individuals, imputed using the entire dataset, blood markers only. | |
rimp.totfatb.w <- subset(rimp.totfatb, ethnicg2=="White", select=-ethnicg2) | |
rimp.totfatb.j <- subset(rimp.totfatb, ethnicg2=="Japanese", select=-ethnicg2) | |
rimp.trperib.w <- subset(rimp.trperib, ethnicg2=="White", select=-ethnicg2) | |
rimp.trperib.j <- subset(rimp.trperib, ethnicg2=="Japanese", select=-ethnicg2) | |
rimp.totfat.w <- subset(rimp.totfat, ethnicg2=="White", select=-ethnicg2) | |
rimp.totfat.j <- subset(rimp.totfat, ethnicg2=="Japanese", select=-ethnicg2) | |
rimp.trperi.w <- subset(rimp.trperi, ethnicg2=="White", select=-ethnicg2) | |
rimp.trperi.j <- subset(rimp.trperi, ethnicg2=="Japanese", select=-ethnicg2) | |
cols.to.remove <- c("mp_SLC2A5","mp_H19") | |
rimp.totfat.w <- rimp.totfat.w[ ,-match(cols.to.remove, names(rimp.totfat.w))] | |
rimp.totfat.j <- rimp.totfat.j[ ,-match(cols.to.remove, names(rimp.totfat.j))] | |
rimp.trperi.w <- rimp.trperi.w[ ,-match(cols.to.remove, names(rimp.trperi.w))] | |
rimp.trperi.j <- rimp.trperi.j[ ,-match(cols.to.remove, names(rimp.trperi.j))] | |
#Random splits | |
cols.to.remove <- c("mp_SLC2A5","mp_H19") | |
combined.lowmissing <- combined[ ,-match(cols.to.remove, names(combined))] | |
combined.imputed <- rfImpute(CRC_FAT_TOT~., data=combined.lowmissing) | |
wh <- subset(combined.imputed, ethnicg2=="White") | |
ja <- subset(combined.imputed, ethnicg2=="Japanese") | |
cols.to.randomize <- NULL | |
cols.to.randomize=c(cols.to.randomize,"CRC_FAT_TOT") | |
cols.to.randomize=c(cols.to.randomize,"trunk_peri") | |
cols.to.randomize=c(cols.to.randomize,"logliver") | |
cols.to.randomize=c(cols.to.randomize,"MRI_VISC_PERC_ABDO") | |
cols.to.randomize=c(cols.to.randomize,"visc_subc") | |
cols.to.randomize=c(cols.to.randomize,"CRC_AGE") | |
cols.to.randomize=c(cols.to.randomize,"CRC_TECH_BMI") | |
cols.to.randomize=c(cols.to.randomize,"waisthip_tech_navel") | |
set.seed(808) | |
sets.wh <- splitdf.randomize(wh, cols.to.randomize) | |
sets.ja <- splitdf.randomize(ja, cols.to.randomize) | |
set1 <- rbind(sets.wh$set1, sets.ja$set1) | |
set2 <- rbind(sets.wh$set2, sets.ja$set2) | |
matrix(names(set1)) | |
totfat1 <- set1[ ,c(1,3,4,6,7,15:ncol(set1))] | |
totfat2 <- set2[ ,c(1,3,4,6,7,15:ncol(set2))] | |
trperi1 <- set1[ ,c(8,3,4,6,7,15:ncol(set1))] | |
trperi2 <- set2[ ,c(8,3,4,6,7,15:ncol(set2))] | |
# function to remove all variables that begin with "mp_" | |
remove.mp <- function(df) df[, -(grep("^mp_",names(df)))] | |
totfatb1 <- remove.mp(totfat1) | |
totfatb2 <- remove.mp(totfat2) | |
trperib1 <- remove.mp(trperi1) | |
trperib2 <- remove.mp(trperi2) | |
# In the last few chunks of code I took the combined dataset, imputed | |
# missing values, then did the randomization. I don't want to impute the | |
# outcome for women who didn't have MRI. That's what the next few lines do. | |
cols.to.remove <- c("mp_SLC2A5","mp_H19") | |
combined.lowmissing <- combined[ ,-match(cols.to.remove, names(combined))] | |
haveMRI <- !(is.na(combined.lowmissing$logliver) | is.na(combined.lowmissing$MRI_VISC_PERC_ABDO)) | |
mri <- combined.lowmissing[haveMRI, ] | |
mri.imputed <- rfImpute(CRC_FAT_TOT~., data=mri) | |
mri.wh <- subset(mri.imputed, ethnicg2=="White") #n==28 | |
mri.ja <- subset(mri.imputed, ethnicg2=="Japanese") #n==20 | |
set.seed(42) | |
sets.mri.wh <- splitdf.randomize(mri.wh, cols.to.randomize) | |
sets.mri.ja <- splitdf.randomize(mri.ja, cols.to.randomize) | |
mri.set1 <- rbind(sets.mri.wh$set1, sets.mri.ja$set1) | |
mri.set2 <- rbind(sets.mri.wh$set2, sets.mri.ja$set2) | |
liver1 <- mri.set1[ ,c(11,3,4,6,7,15:ncol(mri.set1))] | |
liver2 <- mri.set2[ ,c(11,3,4,6,7,15:ncol(mri.set2))] | |
visc1 <- mri.set1[ ,c(9,3,4,6,7,15:ncol(mri.set1))] | |
visc2 <- mri.set2[ ,c(9,3,4,6,7,15:ncol(mri.set2))] | |
viscsub1 <- mri.set1[ ,c(14,3,4,6,7,15:ncol(mri.set1))] | |
viscsub2 <- mri.set2[ ,c(14,3,4,6,7,15:ncol(mri.set2))] | |
############################################################################### | |
############### total fat, imputing vs not imputing ########################### | |
############################################################################### | |
# Fit the model, no imputation | |
set.seed(1) | |
totfat.rf <- randomForest( | |
CRC_FAT_TOT~., | |
data=totfat, | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
# Print out the model to make sure it did regression and to see the % var explained. | |
print(totfat.rf) | |
rfr2(totfat.rf) | |
plot(totfat.rf) | |
# Plot the importance scores | |
impplot(totfat.rf, "DXA total fat, JA+White") | |
png("totfat, all bmi.png", w=600, h=600) | |
impplot(totfat.rf, "DXA total fat, JA+White") | |
dev.off() | |
############################################################################### | |
# rough imputation on totfat | |
matrix(names(mimp.totfat)) | |
set.seed(2) | |
mimp.totfat.rf <- randomForest( | |
CRC_FAT_TOT~., | |
data=mimp.totfat, | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(mimp.totfat.rf) | |
rfr2(mimp.totfat.rf) | |
plot(mimp.totfat.rf) | |
impplot(mimp.totfat.rf, "DXA total fat, JA+White, median-imputed") | |
############################################################################### | |
# using RF Imputation | |
set.seed(3) | |
rimp.totfat.rf <- randomForest( | |
CRC_FAT_TOT~., | |
data=rimp.totfat, | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.totfat.rf) | |
rfr2(rimp.totfat.rf) | |
plot(rimp.totfat.rf) | |
impplot(rimp.totfat.rf, "DXA total fat, JA+White, RF-imputed") | |
png("totfat-combined-imputed.png", w=600, h=600) | |
impplot(rimp.totfat.rf, "DXA total fat, JA+White, RF-imputed") | |
dev.off() | |
############################################################################### | |
#Plot all of those results | |
png("adiposity-testing-imputation.png",w=1000,h=1500, res=100) | |
par(mfrow=c(3,2)) | |
plot(totfat.rf) | |
impplot(totfat.rf, "DXA total fat, JA+White") | |
plot(mimp.totfat.rf) | |
impplot(mimp.totfat.rf, "DXA total fat, JA+White, median-imputed") | |
plot(rimp.totfat.rf) | |
impplot(rimp.totfat.rf, "DXA total fat, JA+White, RF-imputed") | |
dev.off() | |
############################################################################### | |
################## combined, trunk to periphery ratio, imputed ############### | |
############################################################################### | |
set.seed(4) | |
rimp.trperi.rf <- randomForest( | |
trunk_peri~., | |
data=rimp.trperi, | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.trperi.rf) | |
rfr2(rimp.trperi.rf) | |
plot(rimp.trperi.rf) | |
impplot(rimp.trperi.rf, "Trunk:Peri, JA+White, RF-imputed") | |
#What was the actual importance? | |
myimp <- as.data.frame(importance(rimp.trperi.rf)) | |
myimp$var <- row.names(myimp) | |
row.names(myimp) <- NULL | |
subset(myimp, var=="ethnicg2") | |
#plot the importance scores saving to file. | |
png("trunkperi-combined-imputed.png", w=600, h=600) | |
impplot(rimp.trperi.rf, "Trunk:Peri, JA+White, RF-imputed") | |
dev.off() | |
# it seems like ethnicity is no longer important. what happens | |
# if you fit a linear model with WHR? ethnicity is no longer sig! | |
summary(lm(trunk_peri~ethnicg2, data=rimp.trperi)) | |
summary(lm(trunk_peri~ethnicg2+waisthip_tech_navel, data=rimp.trperi)) | |
############################################################################### | |
#what happens if you run the random forests model without allowing WHR? | |
set.seed(5) | |
rimp.trperi.rf.nowhr <- randomForest( | |
trunk_peri~., | |
data=rimp.trperi[which(names(trperi) %nin% "waisthip_tech_navel")], | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.trperi.rf.nowhr) | |
rfr2(rimp.trperi.rf.nowhr) | |
impplot(rimp.trperi.rf.nowhr, "Trunk:Peri, JA+White, RF-imputed, excl WHR") | |
png("trunkperi-combined-imputed-exclwhr.png", w=600, h=600) | |
impplot(rimp.trperi.rf.nowhr, "Trunk:Peri, JA+White, RF-imputed, excl WHR") | |
dev.off() | |
# if you account for some of the other important variables is ethnicity still important? | |
summary(lm(trunk_peri~ethnicg2, data=rimp.trperi)) | |
summary(lm(trunk_peri~ethnicg2+CRC_TECH_BMI+ALSR_Insulin+ALSR_PAI1+lep_adipo+ALSR_25D3+ALSR_Glucose+ALSR_TG+HOMA_IR+ALSR_HDL, data=rimp.trperi)) | |
############################################################################### | |
################## totfat and trperi, with blood markers only################## | |
############################################################################### | |
set.seed(6) | |
rimp.totfatb.rf <- randomForest( | |
CRC_FAT_TOT~., | |
data=rimp.totfatb, | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.totfatb.rf) | |
rfr2(rimp.totfatb.rf) | |
impplot(rimp.totfatb.rf, "TotFat, JA+W, bloodmrks, RFimputed") | |
png("totfat-combined-imputed-bloodmarkersonly.png", w=600, h=600) | |
impplot(rimp.totfatb.rf, "TotFat, JA+W, bldmrks, RFimputed") | |
dev.off() | |
set.seed(7) | |
rimp.trperib.rf <- randomForest( | |
trunk_peri~., | |
data=rimp.trperib, | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.trperib.rf) | |
rfr2(rimp.trperib.rf) | |
impplot(rimp.trperib.rf, "TotFat, JA+W, bloodmrks, RFimputed") | |
png("trunkperi-combined-imputed-bloodmarkersonly.png", w=600, h=600) | |
impplot(rimp.trperib.rf, "Trunk:Peri, JA+W, bldmrks, RFimputed") | |
dev.off() | |
############################################################################### | |
################## totfat and trperi, splitting by ethnicity ################## | |
############################################################################### | |
# The first thing I wanted to to is to make sure that how I've set up | |
# training and testing works properly. To do this, I took one subset of | |
# the data (white women only), and used that dataset for BOTH training and | |
# testing. Doing this, the testing R² should be extremely high. Then I'll | |
# do the same thing, but permute (shuffle) the outcome variable for the | |
# white women in the testing set. The OOB training R² should still be | |
# high, but the testing R² should plummet to near zero. | |
# train on white, test on white | |
set.seed(8) | |
rimp.totfatb.rf.split.ww <- randomForest( | |
x=rimp.totfatb.w[ ,-1], #exclude the first column, the outcome | |
y=rimp.totfatb.w[ , 1], #include only the first column, the outcome | |
xtest=rimp.totfatb.w[ ,-1], | |
ytest=rimp.totfatb.w[ , 1], | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.totfatb.rf.split.ww) | |
# train on white, test on permutedwhite | |
set.seed(9) | |
rimp.totfatb.rf.split.wpw <- randomForest( | |
x=rimp.totfatb.w[ ,-1], | |
y=rimp.totfatb.w[ , 1], | |
xtest=rimp.totfatb.w[ ,-1], | |
ytest=permute(rimp.totfatb.w, "CRC_FAT_TOT")[ , 1], | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.totfatb.rf.split.wpw) | |
############################################################################### | |
## Now, train on one ethnicity, test on another: | |
# train on white, test on JA, totfat, blood markers only: | |
set.seed(10) | |
rimp.totfatb.rf.split.wj <- randomForest( | |
x=rimp.totfatb.w[ ,-1], #exclude the first column, the outcome | |
y=rimp.totfatb.w[ , 1], #include only the first column, the outcome | |
xtest=rimp.totfatb.j[ ,-1], | |
ytest=rimp.totfatb.j[ , 1], | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.totfatb.rf.split.wj) | |
impplot(rimp.totfatb.rf.split.wj, "TotFat, train:W, test:JA, RF-imputed, bldmrks") | |
# train on white, test on JA, totfat, all markers except H19 and SLC25A: | |
set.seed(100) | |
rimp.totfat.rf.split.wj <- randomForest( | |
x=rimp.totfat.w[ ,-1], #exclude the first column, the outcome | |
y=rimp.totfat.w[ , 1], #include only the first column, the outcome | |
xtest=rimp.totfat.j[ ,-1], | |
ytest=rimp.totfat.j[ , 1], | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.totfat.rf.split.wj) | |
impplot(rimp.totfat.rf.split.wj, "TotFat, train:W, test:JA, RF-imputed, bldmrks") | |
# train on JA, test on white, totfat: | |
set.seed(10) | |
rimp.totfatb.rf.splitjw<- randomForest( | |
x=rimp.totfatb.j[ ,-1], #exclude the first column, the outcome | |
y=rimp.totfatb.j[ , 1], #include only the first column, the outcome | |
xtest=rimp.totfatb.w[ ,-1], | |
ytest=rimp.totfatb.w[ , 1], | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.totfatb.rf.split.jw) | |
impplot(rimp.totfatb.rf.split.jw, "TotFat, train:JA, test:W, RF-imputed, bldmrks") | |
# train on white, test on JA, trperi: | |
set.seed(12) | |
rimp.trperib.rf.split.wj <- randomForest( | |
x=rimp.trperib.w[ ,-1], #exclude the first column, the outcome | |
y=rimp.trperib.w[ , 1], #include only the first column, the outcome | |
xtest=rimp.trperib.j[ ,-1], | |
ytest=rimp.trperib.j[ , 1], | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.trperib.rf.split.wj) | |
impplot(rimp.trperib.rf.split.wj, "Tr/peri, train:W, test:JA, RF-imputed, bldmrks") | |
# train on JA, test on white, trperi: | |
set.seed(13) | |
rimp.trperib.rf.split.jw <- randomForest( | |
x=rimp.trperib.j[ ,-1], #exclude the first column, the outcome | |
y=rimp.trperib.j[ , 1], #include only the first column, the outcome | |
xtest=rimp.trperib.w[ ,-1], | |
ytest=rimp.trperib.w[ , 1], | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.trperib.rf.split.jw) | |
impplot(rimp.trperib.rf.split.jw, "Tr/peri, train:JA, test:W, RF-imputed, bldmrks") | |
# train on JA, test on white, trperi, all markers except H19 and HLC2A5: | |
set.seed(130) | |
rimp.trperi.rf.split.jw <- randomForest( | |
x=rimp.trperi.j[ ,-1], #exclude the first column, the outcome | |
y=rimp.trperi.j[ , 1], #include only the first column, the outcome | |
xtest=rimp.trperi.w[ ,-1], | |
ytest=rimp.trperi.w[ , 1], | |
importance=TRUE, | |
keep.forest=TRUE, | |
na.action=na.omit | |
) | |
print(rimp.trperi.rf.split.jw) | |
impplot(rimp.trperi.rf.split.jw, "Tr/peri, train:JA, test:W, RF-imputed, bldmrks") | |
# Totfat, train on white, test on JA | |
print(rimp.totfatb.rf.split.wj) | |
# Totfat, train on JA, test on white | |
print(rimp.totfatb.rf.split.jw) | |
# Trunk/peri train on white, test on JA | |
print(rimp.trperib.rf.split.wj) | |
# Trunk/peri train on JA, test on white | |
print(rimp.trperib.rf.split.jw) | |
png("totfat-WJ.png", w=600, h=600) | |
impplot(rimp.totfatb.rf.split.wj, "TotFat, train:W, test:JA") | |
dev.off() | |
png("totfat-JW.png", w=600, h=600) | |
impplot(rimp.totfatb.rf.split.jw, "TotFat, train:JA, test:W") | |
dev.off() | |
png("trperi-WJ.png", w=600, h=600) | |
impplot(rimp.trperib.rf.split.wj, "Tr/peri, train:W, test:JA") | |
dev.off() | |
png("trperi-JW.png", w=600, h=600) | |
impplot(rimp.trperib.rf.split.jw, "Tr/peri, train:JA, test:W") | |
dev.off() | |
png("both-totfat-trperi-wj-jw.png", w=1200, h=1200) | |
par(mfrow=c(2,2)) | |
impplot(rimp.totfatb.rf.split.wj, "TotFat, train:W, test:JA") | |
impplot(rimp.trperib.rf.split.wj, "Tr/peri, train:W, test:JA") | |
impplot(rimp.totfatb.rf.split.jw, "TotFat, train:JA, test:W") | |
impplot(rimp.trperib.rf.split.jw, "Tr/peri, train:JA, test:W") | |
dev.off() | |
################ | |
#randomization | |
#try fitting a stepwise model. The predictive R2 is way worse. | |
lmfit <- lm(CRC_FAT_TOT~., data=totfat1) | |
stepfit <- step(lmfit, direction="forward") | |
steppred <- predict(stepfit, totfat2[ ,-1]) | |
rsq(steppred,totfat2[ ,1]) | |
#now fit the random forest model | |
set.seed(14) | |
totfat.rf.split12 <- randomForest( | |
x=totfat1[ ,-1], #exclude the first column, the outcome | |
y=totfat1[ , 1], #include only the first column, the outcome | |
xtest=totfat2[ ,-1], | |
ytest=totfat2[ , 1], | |
keep.forest=TRUE, | |
importance=TRUE, | |
) | |
print(totfat.rf.split12) | |
rsq(predict(totfat.rf.split12, totfat2[ ,-1]), totfat2[ ,1]) | |
png("totfat-15split.png",w=600, h=600) | |
impplot(totfat.rf.split12, "Total fat, 15/15 random") | |
dev.off() | |
set.seed(15) | |
trperi.rf.split12 <- randomForest( | |
x=trperi1[ ,-1], #exclude the first column, the outcome | |
y=trperi1[ , 1], #include only the first column, the outcome | |
xtest=trperi2[ ,-1], | |
ytest=trperi2[ , 1], | |
keep.forest=TRUE, | |
importance=TRUE, | |
) | |
print(trperi.rf.split12) | |
rsq(predict(trperi.rf.split12, trperi2[ ,-1]), trperi2$trunk_peri) | |
png("trperi-15split.png",w=600, h=600) | |
impplot(trperi.rf.split12, "Trunk-peri, 15/15 random") | |
dev.off() | |
################################################### | |
# logliver | |
set.seed(15) | |
liver.rf.split12 <- randomForest( | |
x=liver1[ ,-1], | |
y=liver1[ , 1], | |
xtest=liver2[ ,-1], | |
ytest=liver2[ , 1], | |
keep.forest=TRUE, | |
importance=TRUE, | |
) | |
print(liver.rf.split12) | |
rsq(predict(liver.rf.split12, liver2[ ,-1]), liver2[ ,1]) | |
png("liver-split.png",w=600, h=600) | |
impplot(liver.rf.split12, "logliver, 14w/10j split") | |
dev.off() | |
# visceral | |
set.seed(16) | |
visc.rf.split12 <- randomForest( | |
x=visc1[ ,-1], | |
y=visc1[ , 1], | |
xtest=visc2[ ,-1], | |
ytest=visc2[ , 1], | |
keep.forest=TRUE, | |
importance=TRUE, | |
) | |
print(visc.rf.split12) | |
rsq(predict(visc.rf.split12, visc2[ ,-1]), visc2[ ,1]) | |
png("visc-split.png",w=600, h=600) | |
impplot(visc.rf.split12, "%visceral, 14w/10j split") | |
dev.off() | |
# visceral/subc ratio | |
set.seed(16) | |
viscsub.rf.split12 <- randomForest( | |
x=viscsub1[ ,-1], | |
y=viscsub1[ , 1], | |
xtest=viscsub2[ ,-1], | |
ytest=viscsub2[ , 1], | |
keep.forest=TRUE, | |
importance=TRUE, | |
) | |
print(viscsub.rf.split12) | |
rsq(predict(viscsub.rf.split12, viscsub2[ ,-1]), viscsub2[ ,1]) | |
png("viscsub-split.png",w=600, h=600) | |
impplot(viscsub.rf.split12, "visceral/subratio, 14w/10j split") | |
dev.off() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment