stephenturner · February 16, 2011 00:59
diff --git a/2011-02-15 rf adiposity.r b/2011-02-15 rf adiposity.r
 library(randomForest)

 ###############################################################################
 ############################## load functions #################################
 ###############################################################################

 # Pull the "% Var explained" figure(s) out of a model's printed summary and
 # return them as proportions (i.e. divided by 100).
 #
 # Args:
 #   randomForestModel: a fitted object whose print() output contains one or
 #     two lines with the word "explained", each followed by ": <number>".
 #
 # Returns:
 #   Numeric vector of length 1 or 2, in the order the lines are printed.
 #   impplot() labels the first value "OOB Training" and the second "Testing".
 #   Negative values are returned as they are printed.
 #
 # Raises:
 #   An error when no "explained" line is printed (e.g. the model is not a
 #   regression forest) or when more than two are printed.
 rfr2 <- function(randomForestModel) {
 	printoutput <- capture.output(print(randomForestModel))
 	varline <- grep("explained", printoutput, value=TRUE)
 	if (length(varline) == 0) {
 		stop("no 'explained' line found in the printed model; is this a regression forest?")
 	}
 	if (length(varline)>2) stop("more than two var's explained!")
 	# the number is whatever follows the first ":" on each matching line
 	r2out <- vapply(
 		strsplit(varline, ":"),
 		function(parts) as.numeric(parts[2]),
 		numeric(1)
 	)
 	#r2out=sapply(r2out,  function(n) if(n<0) return(0) else return(n))
 	r2out/100
 }

 # Draw the variable-importance plot (varImpPlot, type=1) for a model and put
 # the R² value(s) returned by rfr2() into the plot title.
 #   randomForestModel: fitted model handed to rfr2() and varImpPlot()
 #   maintitle:         text for the first line of the title
 # One R² gives an "OOB Training" line; two give "OOB Training" and "Testing"
 # lines. Any other count is an error.
 impplot <- function(randomForestModel, maintitle="ethnicgroup") {
 	fitR2 <- rfr2(randomForestModel)
 	nR2 <- length(fitR2)
 	if (nR2 != 1 && nR2 != 2) stop("you have more than two r2s!")
 	plotTitle <- if (nR2 == 1) {
 		# single dataset case
 		paste(maintitle, "\nOOB Training R²=" , fitR2)
 	} else {
 		# training + testing case
 		paste(maintitle, "\nOOB Training R²=" , fitR2[1],"\nTesting R²=",fitR2[2])
 	}
 	varImpPlot(randomForestModel, pch=16, type=1, main=plotTitle)
 }
 			
 # Shuffle the values of one column of a data frame, leaving every other
 # column untouched.
 #
 # Args:
 #   df:              data frame.
 #   columnToPermute: name of the column to shuffle; it must match exactly
 #                    one column of df.
 #   seed:            optional value passed to set.seed() before shuffling.
 #
 # Returns:
 #   df with the values of the named column in random order.
 #
 # Side effects:
 #   prints a "Random seed:" line, and consumes random numbers.
 #
 # Raises:
 #   An error when columnToPermute does not match exactly one column.
 permute <- function (df, columnToPermute="column", seed=NULL) {
 	if (!is.null(seed)) set.seed(seed)
 	print(paste("Random seed:",seed))
 	colindex <- which(names(df)==columnToPermute)
 	if (length(colindex) != 1) {
 		stop("columnToPermute must name exactly one column of df: ", columnToPermute)
 	}
 	# seq_len() rather than 1:nrow(df), which would be c(1, 0) for zero rows
 	permutedcol <- df[ ,colindex][sample(seq_len(nrow(df)))]
 	df[colindex] <- permutedcol
 	df
 }

 # splitdf splits a data frame at random into a training and a testing set.
 #
 # Args:
 #   dataframe: data frame to split.
 #   seed:      optional value passed to set.seed() before sampling.
 #
 # Returns:
 #   A list of two data frames: trainset holds trunc(nrow/2) randomly chosen
 #   rows (in sampled order), testset holds every remaining row (in original
 #   order). Each row ends up in exactly one of the two.
 splitdf <- function(dataframe, seed=NULL) {
 	if (!is.null(seed)) set.seed(seed)
 	index <- seq_len(nrow(dataframe))
 	trainindex <- sample(index, trunc(length(index)/2))
 	# A logical mask is used instead of -trainindex: a negative subscript built
 	# from an empty vector selects no rows at all, which would drop the only
 	# row of a one-row data frame from both sets.
 	intrain <- index %in% trainindex
 	trainset <- dataframe[trainindex, ]
 	testset <- dataframe[!intrain, ]
 	list(trainset=trainset,testset=testset)
 }

 # Repeatedly split a data frame in half with splitdf() until the two halves
 # look alike on a set of continuous variables.
 #
 # Args:
 #   dataframe:     data frame to split.
 #   ttestcolnames: character vector of column names that should be equally
 #                  distributed among the two sets. These columns must be
 #                  continuous variables. chi2 not yet implemented.
 #
 # Returns:
 #   list(set1=, set2=): the two halves of the first split for which every
 #   two-sample t-test between the halves has p >= 0.5.
 #
 # Side effects:
 #   prints the p-values of every attempt, and consumes random numbers (no
 #   seed is set here; call set.seed() beforehand for a reproducible split).
 #
 # Raises:
 #   An error naming any requested column that is not in dataframe.
 splitdf.randomize <- function(dataframe, ttestcolnames=c("cols","to","test")) {
 	missingcols <- setdiff(ttestcolnames, names(dataframe))
 	if (length(missingcols) > 0) {
 		stop(paste(paste(missingcols, collapse=", "), "not in dataframe"))
 	}
 	ps <- NULL
 	while (is.null(ps) || any(ps<.5)) {
 		# Both halves must come from the SAME random split; calling splitdf()
 		# once per half would give two unrelated splits whose rows overlap.
 		halves <- splitdf(dataframe)
 		set1 <- halves$trainset
 		set2 <- halves$testset
 		# Test the columns in the order they were asked for, so each p-value
 		# lines up with its name in the printout below.
 		ps <- vapply(
 			ttestcolnames,
 			function(colname) t.test(set1[[colname]], set2[[colname]])$p.value,
 			numeric(1),
 			USE.NAMES=FALSE
 		)
 		print(paste(ttestcolnames," t-test p-value =",ps))
 		cat("\n")
 	}
 	list(set1=set1,set2=set2)
 }


 ###############################################################################
 ################################### load data #################################
 ###############################################################################


 # Load the raw data from the project's CSV export.
 combined <- read.csv(
 	file="C:/Users/turnersd/Documents/Dropbox/Docs/Work/2011-02-14 Obesity project/pilotkeyvars_missingNA.csv"
 )

 # ethnicg2 arrives as a numeric code (1=white, 2=japanese); turn it into a
 # labelled factor so the groups can be picked out by name further down.
 combined$ethnicg2 <- factor(
 	x=combined$ethnicg2,
 	labels=c("White","Japanese")
 )


 # Make new datasets. "totfat" contains the variable CRC_FAT_TOT (DXA total 
 # fat) in the first column, then every other clinical / biomarker after 
 # that. "trperi" contains the variable trunk_peri (trunk to peripheral fat 
 # ratio), then every other column after that. 
 # Columns are picked by position, so the listing of names(combined) printed
 # by the next line is what ties each index to a variable name.
 matrix(names(combined))
 totfat = combined[ ,c(7,2,3,5,6,15:63)]
 trperi = combined[ ,c(8,2,3,5,6,15:63)]
 #cbind(matrix(names(totfat)),matrix(names(trperi)))

 # Do a rough imputation. This sets the NAs to the median (i.e. mimp.totfat)
 mimp.totfat <- na.roughfix(totfat)
 mimp.trperi <- na.roughfix(trperi)

 # impute using the random forest proximity measures
 # A single seed covers both rfImpute() calls, so the second result depends on
 # the first call having run immediately before it.
 set.seed(42)
 rimp.totfat <- rfImpute(CRC_FAT_TOT~., data=totfat)
 rimp.trperi <- rfImpute(trunk_peri~., data=trperi)

 # the methylation profiles were only taken on the 44 women who had MRIs. 
 # worth running again using only the blood markers.
 #appending the b to the dataset names - blood markers only (no mp_* markers)
 # NOTE(review): columns 24:46 are dropped by position; presumably these are
 # exactly the mp_* columns -- confirm against the names listing above.
 totfatb=totfat[-(24:46)]
 trperib=trperi[-(24:46)]
 #cbind(names(totfatb),names(trperib))

 # imputation with blood markers only
 # Same seed as the full-marker imputation above, again shared by both calls.
 set.seed(42)
 rimp.totfatb <- rfImpute(CRC_FAT_TOT~., data=totfatb)
 rimp.trperib <- rfImpute(trunk_peri~., data=trperib)


 #rimp.totfatb dataset contains total fat as the dependent variable, and all individuals, imputed using the entire dataset, blood markers only.
 # Split each imputed dataset by ethnicity. The suffix .w is the White subset
 # and .j the Japanese subset; ethnicg2 itself is dropped from the result
 # because it is constant within each subset.
 rimp.totfatb.w <- subset(rimp.totfatb, ethnicg2=="White", select=-ethnicg2)
 rimp.totfatb.j <- subset(rimp.totfatb, ethnicg2=="Japanese", select=-ethnicg2)
 rimp.trperib.w <- subset(rimp.trperib, ethnicg2=="White", select=-ethnicg2)
 rimp.trperib.j <- subset(rimp.trperib, ethnicg2=="Japanese", select=-ethnicg2)

 # Same split for the datasets that still carry the mp_* markers.
 rimp.totfat.w <- subset(rimp.totfat, ethnicg2=="White", select=-ethnicg2)
 rimp.totfat.j <- subset(rimp.totfat, ethnicg2=="Japanese", select=-ethnicg2)
 rimp.trperi.w <- subset(rimp.trperi, ethnicg2=="White", select=-ethnicg2)
 rimp.trperi.j <- subset(rimp.trperi, ethnicg2=="Japanese", select=-ethnicg2)

 # Remove two named methylation columns from the all-marker subsets.
 # match() returns the position of each name; both names must be present,
 # otherwise the negative index contains NA and the subscript fails.
 cols.to.remove <- c("mp_SLC2A5","mp_H19")
 rimp.totfat.w <- rimp.totfat.w[ ,-match(cols.to.remove, names(rimp.totfat.w))]
 rimp.totfat.j <- rimp.totfat.j[ ,-match(cols.to.remove, names(rimp.totfat.j))]
 rimp.trperi.w <- rimp.trperi.w[ ,-match(cols.to.remove, names(rimp.trperi.w))]
 rimp.trperi.j <- rimp.trperi.j[ ,-match(cols.to.remove, names(rimp.trperi.j))]

 # Random splits
 # Start from the raw data minus the two methylation columns named below,
 # impute the remaining gaps with rfImpute(), then halve each ethnic group
 # separately and stack the halves, so set1 and set2 each hold half of the
 # White rows and half of the Japanese rows.
 cols.to.remove <- c("mp_SLC2A5", "mp_H19")
 combined.lowmissing <- combined[ , -match(cols.to.remove, names(combined))]
 combined.imputed <- rfImpute(CRC_FAT_TOT ~ ., data=combined.lowmissing)

 wh <- subset(combined.imputed, ethnicg2 == "White")
 ja <- subset(combined.imputed, ethnicg2 == "Japanese")

 # variables that must be balanced (by t-test) between the two halves
 cols.to.randomize <- c(
 	"CRC_FAT_TOT",
 	"trunk_peri",
 	"logliver",
 	"MRI_VISC_PERC_ABDO",
 	"visc_subc",
 	"CRC_AGE",
 	"CRC_TECH_BMI",
 	"waisthip_tech_navel"
 )

 set.seed(808)
 sets.wh <- splitdf.randomize(wh, cols.to.randomize)
 sets.ja <- splitdf.randomize(ja, cols.to.randomize)
 set1 <- rbind(sets.wh[["set1"]], sets.ja[["set1"]])
 set2 <- rbind(sets.wh[["set2"]], sets.ja[["set2"]])

 # Print the column names with their positions; the selections below are by
 # position (outcome first, then the predictors).
 matrix(names(set1))
 totfat1 <- set1[ , c(1, 3, 4, 6, 7, 15:ncol(set1))]
 totfat2 <- set2[ , c(1, 3, 4, 6, 7, 15:ncol(set2))]
 trperi1 <- set1[ , c(8, 3, 4, 6, 7, 15:ncol(set1))]
 trperi2 <- set2[ , c(8, 3, 4, 6, 7, 15:ncol(set2))]

 # Remove all variables whose name begins with "mp_".
 #   df: data frame
 # Returns df without those columns, always as a data frame. A logical mask
 # is used rather than -(grep(...)): when no name matches, grep() returns an
 # empty vector and a negative empty subscript would drop EVERY column.
 remove.mp <- function(df) df[ , !grepl("^mp_", names(df)), drop=FALSE]
 # Versions of the four split datasets with the mp_* columns taken out
 # (the "b" in the name follows the naming used for the blood-marker-only
 # datasets elsewhere in this script).
 totfatb1 <- remove.mp(totfat1)
 totfatb2 <- remove.mp(totfat2)
 trperib1 <- remove.mp(trperi1)
 trperib2 <- remove.mp(trperi2)


 # The chunks above impute the whole combined dataset before randomizing.
 # The MRI outcomes should not be imputed for women who never had an MRI, so
 # here the data is first cut down to rows where both MRI-derived variables
 # are present, and only then imputed and split.
 cols.to.remove <- c("mp_SLC2A5", "mp_H19")
 combined.lowmissing <- combined[ , -match(cols.to.remove, names(combined))]
 haveMRI <- !is.na(combined.lowmissing[["logliver"]]) &
 	!is.na(combined.lowmissing[["MRI_VISC_PERC_ABDO"]])
 mri <- combined.lowmissing[haveMRI, ]
 mri.imputed <- rfImpute(CRC_FAT_TOT ~ ., data=mri)

 mri.wh <- subset(mri.imputed, ethnicg2 == "White") #n==28
 mri.ja <- subset(mri.imputed, ethnicg2 == "Japanese") #n==20

 # Halve each ethnic group separately, then stack the halves.
 set.seed(42)
 sets.mri.wh <- splitdf.randomize(mri.wh, cols.to.randomize)
 sets.mri.ja <- splitdf.randomize(mri.ja, cols.to.randomize)
 mri.set1 <- rbind(sets.mri.wh[["set1"]], sets.mri.ja[["set1"]])
 mri.set2 <- rbind(sets.mri.wh[["set2"]], sets.mri.ja[["set2"]])

 # One pair of datasets per MRI outcome: the outcome column comes first,
 # followed by the same positional predictor selection each time.
 liver1 <- mri.set1[ , c(11, 3, 4, 6, 7, 15:ncol(mri.set1))]
 liver2 <- mri.set2[ , c(11, 3, 4, 6, 7, 15:ncol(mri.set2))]
 visc1 <- mri.set1[ , c(9, 3, 4, 6, 7, 15:ncol(mri.set1))]
 visc2 <- mri.set2[ , c(9, 3, 4, 6, 7, 15:ncol(mri.set2))]
 viscsub1 <- mri.set1[ , c(14, 3, 4, 6, 7, 15:ncol(mri.set1))]
 viscsub2 <- mri.set2[ , c(14, 3, 4, 6, 7, 15:ncol(mri.set2))]


 ###############################################################################
 ############### total fat, imputing vs not imputing ###########################
 ###############################################################################

 # Fit the model, no imputation
 # Baseline forest for CRC_FAT_TOT on the un-imputed totfat data;
 # na.action=na.omit is passed so missing values are handled by omission
 # rather than by imputation.
 set.seed(1)
 totfat.rf <- randomForest(
 	CRC_FAT_TOT~., 
 	data=totfat, 
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )

 # Print out the model to make sure it did regression and to see the % var explained.
 print(totfat.rf)
 rfr2(totfat.rf)
 plot(totfat.rf)

 # Plot the importance scores
 impplot(totfat.rf, "DXA total fat, JA+White")

 # Same importance plot again, this time written to a PNG file.
 png("totfat, all bmi.png", w=600, h=600)
 impplot(totfat.rf, "DXA total fat, JA+White")
 dev.off()

 ###############################################################################

 # rough imputation on totfat
 # Print the column names of the na.roughfix()-imputed dataset for reference.
 matrix(names(mimp.totfat))

 # Same model specification as the un-imputed fit, but on mimp.totfat.
 set.seed(2)
 mimp.totfat.rf <- randomForest(
 	CRC_FAT_TOT~., 
 	data=mimp.totfat, 
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )

 # Printed summary, R² as a proportion, error-vs-trees plot, importance plot.
 print(mimp.totfat.rf)
 rfr2(mimp.totfat.rf)
 plot(mimp.totfat.rf)
 impplot(mimp.totfat.rf, "DXA total fat, JA+White, median-imputed")

 ###############################################################################

 # using RF Imputation
 # Same model specification again, this time on rimp.totfat (the dataset
 # produced by rfImpute()).
 set.seed(3)
 rimp.totfat.rf <- randomForest(
 	CRC_FAT_TOT~., 
 	data=rimp.totfat, 
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )

 # Printed summary, R² as a proportion, error-vs-trees plot, importance plot.
 print(rimp.totfat.rf)
 rfr2(rimp.totfat.rf)
 plot(rimp.totfat.rf)
 impplot(rimp.totfat.rf, "DXA total fat, JA+White, RF-imputed")

 # Importance plot once more, saved to a PNG file.
 png("totfat-combined-imputed.png", w=600, h=600)
 impplot(rimp.totfat.rf, "DXA total fat, JA+White, RF-imputed")
 dev.off()

 ###############################################################################

 #Plot all of those results
 # One PNG with a 3x2 grid: each row is one of the three fits (no imputation,
 # median-imputed, RF-imputed), with plot(model) on the left and the
 # importance plot on the right.
 png("adiposity-testing-imputation.png",w=1000,h=1500, res=100)
 par(mfrow=c(3,2))
 plot(totfat.rf)
 impplot(totfat.rf, "DXA total fat, JA+White")
 plot(mimp.totfat.rf)
 impplot(mimp.totfat.rf, "DXA total fat, JA+White, median-imputed")
 plot(rimp.totfat.rf)
 impplot(rimp.totfat.rf, "DXA total fat, JA+White, RF-imputed")
 dev.off()


 ###############################################################################
 ################## combined,  trunk to periphery ratio, imputed ###############
 ###############################################################################

 # Forest for trunk_peri on the RF-imputed dataset, both ethnic groups
 # together (ethnicg2 is one of the predictors).
 set.seed(4)
 rimp.trperi.rf <- randomForest(
 	trunk_peri~., 
 	data=rimp.trperi, 
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )

 # Printed summary, R² as a proportion, error-vs-trees plot, importance plot.
 print(rimp.trperi.rf)
 rfr2(rimp.trperi.rf)
 plot(rimp.trperi.rf)
 impplot(rimp.trperi.rf, "Trunk:Peri, JA+White, RF-imputed")

 #What was the actual importance?
 # Turn the importance matrix into a data frame with the variable name as an
 # ordinary column, then show only the row for ethnicg2.
 myimp <- as.data.frame(importance(rimp.trperi.rf))
 myimp$var <- row.names(myimp)
 row.names(myimp) <- NULL
 subset(myimp, var=="ethnicg2")

 #plot the importance scores saving to file.
 png("trunkperi-combined-imputed.png", w=600, h=600)
 impplot(rimp.trperi.rf, "Trunk:Peri, JA+White, RF-imputed")
 dev.off()

 # it seems like ethnicity is no longer important. what happens 
 # if you fit a linear model with WHR? ethnicity is no longer sig!
 # First model: ethnicity alone. Second: ethnicity plus waist-hip ratio.
 summary(lm(trunk_peri~ethnicg2, data=rimp.trperi))
 summary(lm(trunk_peri~ethnicg2+waisthip_tech_navel, data=rimp.trperi))

 ###############################################################################

 #what happens if you run the random forests model without allowing WHR?
 # The waist-hip-ratio column is dropped with base R's %in% (negated), so no
 # extra package is needed, and the mask is built from the names of the very
 # data frame being indexed.
 set.seed(5)
 rimp.trperi.rf.nowhr <- randomForest(
 	trunk_peri~., 
 	data=rimp.trperi[!(names(rimp.trperi) %in% "waisthip_tech_navel")], 
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )

 # Printed summary, R² as a proportion, importance plot.
 print(rimp.trperi.rf.nowhr)
 rfr2(rimp.trperi.rf.nowhr)
 impplot(rimp.trperi.rf.nowhr, "Trunk:Peri, JA+White, RF-imputed, excl WHR")

 # Importance plot once more, saved to a PNG file.
 png("trunkperi-combined-imputed-exclwhr.png", w=600, h=600)
 impplot(rimp.trperi.rf.nowhr, "Trunk:Peri, JA+White, RF-imputed, excl WHR")
 dev.off()

 # if you account for some of the other important variables is ethnicity still important?
 # First model: ethnicity alone. Second: ethnicity adjusted for the listed
 # clinical / blood variables.
 summary(lm(trunk_peri~ethnicg2, data=rimp.trperi))
 summary(lm(trunk_peri~ethnicg2+CRC_TECH_BMI+ALSR_Insulin+ALSR_PAI1+lep_adipo+ALSR_25D3+ALSR_Glucose+ALSR_TG+HOMA_IR+ALSR_HDL, data=rimp.trperi))


 ###############################################################################
 ################## totfat and trperi, with blood markers only##################
 ###############################################################################

 # Total fat, both ethnic groups, RF-imputed, blood markers only.
 set.seed(6)
 rimp.totfatb.rf <- randomForest(
 	CRC_FAT_TOT~., 
 	data=rimp.totfatb, 
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )

 # Printed summary, R² as a proportion, importance plot.
 print(rimp.totfatb.rf)
 rfr2(rimp.totfatb.rf)
 impplot(rimp.totfatb.rf, "TotFat, JA+W, bloodmrks, RFimputed")

 # Importance plot once more, saved to a PNG file.
 png("totfat-combined-imputed-bloodmarkersonly.png", w=600, h=600)
 impplot(rimp.totfatb.rf, "TotFat, JA+W, bldmrks, RFimputed")
 dev.off()


 # Trunk:peripheral ratio, both ethnic groups, RF-imputed, blood markers only.
 set.seed(7)
 rimp.trperib.rf <- randomForest(
 	trunk_peri~., 
 	data=rimp.trperib, 
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )

 print(rimp.trperib.rf)
 rfr2(rimp.trperib.rf)
 # The title names the outcome this model was fitted on (trunk:peri).
 impplot(rimp.trperib.rf, "Trunk:Peri, JA+W, bloodmrks, RFimputed")

 png("trunkperi-combined-imputed-bloodmarkersonly.png", w=600, h=600)
 impplot(rimp.trperib.rf, "Trunk:Peri, JA+W, bldmrks, RFimputed")
 dev.off()



 ###############################################################################
 ################## totfat and trperi, splitting by ethnicity ##################
 ###############################################################################


 # The first thing I wanted to do is to make sure that how I've set up 
 # training and testing works properly. To do this, I took one subset of 
 # the data (white women only), and used that dataset for BOTH training and 
 # testing. Doing this, the testing R² should be extremely high. Then I'll 
 # do the same thing, but permute (shuffle) the outcome variable for the 
 # white women in the testing set. The OOB training R² should still be 
 # high, but the testing R² should plummet to near zero. 

 # train on white, test on white
 # Uses the x/y interface: the outcome is the first column of the dataset,
 # every other column is a predictor.
 set.seed(8)
 rimp.totfatb.rf.split.ww <- randomForest(
 	x=rimp.totfatb.w[ ,-1], #exclude the first column, the outcome
 	y=rimp.totfatb.w[ , 1], #include only the first column, the outcome
 	xtest=rimp.totfatb.w[ ,-1],
 	ytest=rimp.totfatb.w[ , 1],
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )
 print(rimp.totfatb.rf.split.ww)

 # train on white, test on permutedwhite
 # Same predictors for training and testing; only the test outcome is
 # shuffled, via permute() on the CRC_FAT_TOT column.
 set.seed(9)
 rimp.totfatb.rf.split.wpw <- randomForest(
 	x=rimp.totfatb.w[ ,-1],
 	y=rimp.totfatb.w[ , 1],
 	xtest=rimp.totfatb.w[ ,-1],
 	ytest=permute(rimp.totfatb.w, "CRC_FAT_TOT")[ , 1],
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )
 print(rimp.totfatb.rf.split.wpw)

 ###############################################################################

 ## Now, train on one ethnicity, test on another:
 # train on white, test on JA, totfat, blood markers only:
 # The White subset supplies x/y, the Japanese subset supplies xtest/ytest.
 set.seed(10)
 rimp.totfatb.rf.split.wj <- randomForest(
 	x=rimp.totfatb.w[ ,-1], #exclude the first column, the outcome
 	y=rimp.totfatb.w[ , 1], #include only the first column, the outcome
 	xtest=rimp.totfatb.j[ ,-1],
 	ytest=rimp.totfatb.j[ , 1],
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )
 # Printed summary, then the importance plot titled with both R² values.
 print(rimp.totfatb.rf.split.wj)
 impplot(rimp.totfatb.rf.split.wj, "TotFat, train:W, test:JA, RF-imputed, bldmrks")


 # train on white, test on JA, totfat, all markers except H19 and SLC2A5:
 # The White subset supplies x/y, the Japanese subset supplies xtest/ytest.
 set.seed(100)
 rimp.totfat.rf.split.wj <- randomForest(
 	x=rimp.totfat.w[ ,-1], #exclude the first column, the outcome
 	y=rimp.totfat.w[ , 1], #include only the first column, the outcome
 	xtest=rimp.totfat.j[ ,-1],
 	ytest=rimp.totfat.j[ , 1],
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )
 print(rimp.totfat.rf.split.wj)
 # This model uses the all-marker dataset, so the title says so (it used to
 # carry the "bldmrks" label of the blood-marker-only model).
 impplot(rimp.totfat.rf.split.wj, "TotFat, train:W, test:JA, RF-imputed, all mrks")




 # train on JA, test on white, totfat:
 # The Japanese subset supplies x/y, the White subset supplies xtest/ytest.
 # The result is stored as rimp.totfatb.rf.split.jw, the name that the print
 # and plot calls in this script read.
 set.seed(10)
 rimp.totfatb.rf.split.jw <- randomForest(
 	x=rimp.totfatb.j[ ,-1], #exclude the first column, the outcome
 	y=rimp.totfatb.j[ , 1], #include only the first column, the outcome
 	xtest=rimp.totfatb.w[ ,-1],
 	ytest=rimp.totfatb.w[ , 1],
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )
 print(rimp.totfatb.rf.split.jw)
 impplot(rimp.totfatb.rf.split.jw, "TotFat, train:JA, test:W, RF-imputed, bldmrks")


 # train on white, test on JA, trperi:
 # The White subset supplies x/y, the Japanese subset supplies xtest/ytest.
 set.seed(12)
 rimp.trperib.rf.split.wj <- randomForest(
 	x=rimp.trperib.w[ ,-1], #exclude the first column, the outcome
 	y=rimp.trperib.w[ , 1], #include only the first column, the outcome
 	xtest=rimp.trperib.j[ ,-1],
 	ytest=rimp.trperib.j[ , 1],
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )
 print(rimp.trperib.rf.split.wj)
 impplot(rimp.trperib.rf.split.wj, "Tr/peri, train:W, test:JA, RF-imputed, bldmrks")

 # train on JA, test on white, trperi:
 # The Japanese subset supplies x/y, the White subset supplies xtest/ytest.
 set.seed(13)
 rimp.trperib.rf.split.jw <- randomForest(
 	x=rimp.trperib.j[ ,-1], #exclude the first column, the outcome
 	y=rimp.trperib.j[ , 1], #include only the first column, the outcome
 	xtest=rimp.trperib.w[ ,-1],
 	ytest=rimp.trperib.w[ , 1],
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )
 print(rimp.trperib.rf.split.jw)
 impplot(rimp.trperib.rf.split.jw, "Tr/peri, train:JA, test:W, RF-imputed, bldmrks")


 # train on JA, test on white, trperi, all markers except H19 and SLC2A5:
 # The Japanese subset supplies x/y, the White subset supplies xtest/ytest.
 set.seed(130)
 rimp.trperi.rf.split.jw <- randomForest(
 	x=rimp.trperi.j[ ,-1], #exclude the first column, the outcome
 	y=rimp.trperi.j[ , 1], #include only the first column, the outcome
 	xtest=rimp.trperi.w[ ,-1],
 	ytest=rimp.trperi.w[ , 1],
 	importance=TRUE, 
 	keep.forest=TRUE,
 	na.action=na.omit
 )
 print(rimp.trperi.rf.split.jw)
 # This model uses the all-marker dataset, so the title says so (it used to
 # carry the "bldmrks" label of the blood-marker-only model).
 impplot(rimp.trperi.rf.split.jw, "Tr/peri, train:JA, test:W, RF-imputed, all mrks")






 # Recap of the four blood-marker cross-ethnicity models: print each summary
 # again, then save each importance plot to its own PNG, then save all four
 # together in one 2x2 PNG.

 # Totfat, train on white, test on JA
 print(rimp.totfatb.rf.split.wj)

 # Totfat, train on JA, test on white
 print(rimp.totfatb.rf.split.jw)

 # Trunk/peri train on white, test on JA
 print(rimp.trperib.rf.split.wj)

 # Trunk/peri train on JA, test on white
 print(rimp.trperib.rf.split.jw)

 # Individual PNGs. File-name suffix: WJ = train White / test Japanese,
 # JW = train Japanese / test White.
 png("totfat-WJ.png", w=600, h=600)
 impplot(rimp.totfatb.rf.split.wj, "TotFat,  train:W, test:JA")
 dev.off()

 png("totfat-JW.png", w=600, h=600)
 impplot(rimp.totfatb.rf.split.jw, "TotFat, train:JA,  test:W")
 dev.off()

 png("trperi-WJ.png", w=600, h=600)
 impplot(rimp.trperib.rf.split.wj, "Tr/peri, train:W, test:JA")
 dev.off()

 png("trperi-JW.png", w=600, h=600)
 impplot(rimp.trperib.rf.split.jw, "Tr/peri, train:JA, test:W")
 dev.off()

 # Combined figure: top row is train White / test Japanese, bottom row is
 # train Japanese / test White; left column total fat, right column trunk/peri.
 png("both-totfat-trperi-wj-jw.png", w=1200, h=1200)
 par(mfrow=c(2,2))
 impplot(rimp.totfatb.rf.split.wj, "TotFat,  train:W, test:JA")
 impplot(rimp.trperib.rf.split.wj, "Tr/peri, train:W, test:JA")
 impplot(rimp.totfatb.rf.split.jw, "TotFat, train:JA,  test:W")
 impplot(rimp.trperib.rf.split.jw, "Tr/peri, train:JA, test:W")
 dev.off()



 ################

 #randomization

 #try fitting a stepwise model. The predictive R2 is way worse.
 # Forward selection has to start from the smallest model and be told how far
 # it may grow. With no scope, step() takes the starting model as the upper
 # limit, so a forward search started from the full model has nothing to add
 # and simply returns the full model.
 lmfit <- lm(CRC_FAT_TOT~., data=totfat1)
 nullfit <- lm(CRC_FAT_TOT~1, data=totfat1)
 stepfit <- step(
 	nullfit,
 	scope=list(lower=formula(nullfit), upper=formula(lmfit)),
 	direction="forward"
 )
 # Predict the held-out half from its predictors (everything but column 1).
 steppred <- predict(stepfit, totfat2[ ,-1])
 # NOTE(review): rsq() is not defined in this script; it has to be available
 # in the session. Presumably rsq(predicted, observed) -- confirm.
 rsq(steppred,totfat2[ ,1])

 #now fit the random forest model
 # Train on the first random half, test on the second. The argument list no
 # longer ends in a dangling comma, which passed an extra empty argument.
 set.seed(14)
 totfat.rf.split12 <- randomForest(
 	x=totfat1[ ,-1], #exclude the first column, the outcome
 	y=totfat1[ , 1], #include only the first column, the outcome
 	xtest=totfat2[ ,-1],
 	ytest=totfat2[ , 1],
 	keep.forest=TRUE,
 	importance=TRUE
 )
 print(totfat.rf.split12)
 # NOTE(review): rsq() is not defined in this script; it has to be available
 # in the session. Presumably rsq(predicted, observed) -- confirm.
 rsq(predict(totfat.rf.split12, totfat2[ ,-1]), totfat2[ ,1])
 # Importance plot saved to a PNG file.
 png("totfat-15split.png",w=600, h=600)
 impplot(totfat.rf.split12, "Total fat, 15/15 random")
 dev.off()

 # Trunk:peripheral ratio: train on the first random half, test on the
 # second. The argument list no longer ends in a dangling comma, which passed
 # an extra empty argument.
 set.seed(15)
 trperi.rf.split12 <- randomForest(
 	x=trperi1[ ,-1], #exclude the first column, the outcome
 	y=trperi1[ , 1], #include only the first column, the outcome
 	xtest=trperi2[ ,-1],
 	ytest=trperi2[ , 1],
 	keep.forest=TRUE,
 	importance=TRUE
 )
 print(trperi.rf.split12)
 # NOTE(review): rsq() is not defined in this script; it has to be available
 # in the session. Presumably rsq(predicted, observed) -- confirm.
 rsq(predict(trperi.rf.split12, trperi2[ ,-1]), trperi2$trunk_peri)
 # Importance plot saved to a PNG file.
 png("trperi-15split.png",w=600, h=600)
 impplot(trperi.rf.split12, "Trunk-peri, 15/15 random")
 dev.off()


 ###################################################

 # logliver
 # Train on the first MRI half, test on the second; the outcome is the first
 # column of liver1/liver2. The argument list no longer ends in a dangling
 # comma, which passed an extra empty argument.
 set.seed(15)
 liver.rf.split12 <- randomForest(
 	x=liver1[ ,-1], 
 	y=liver1[ , 1], 
 	xtest=liver2[ ,-1],
 	ytest=liver2[ , 1],
 	keep.forest=TRUE,
 	importance=TRUE
 )
 print(liver.rf.split12)
 # NOTE(review): rsq() is not defined in this script; it has to be available
 # in the session. Presumably rsq(predicted, observed) -- confirm.
 rsq(predict(liver.rf.split12, liver2[ ,-1]), liver2[ ,1])
 # Importance plot saved to a PNG file.
 png("liver-split.png",w=600, h=600)
 impplot(liver.rf.split12, "logliver, 14w/10j split")
 dev.off()

 # visceral
 # Train on the first MRI half, test on the second; the outcome is the first
 # column of visc1/visc2. The argument list no longer ends in a dangling
 # comma, which passed an extra empty argument.
 set.seed(16)
 visc.rf.split12 <- randomForest(
 	x=visc1[ ,-1], 
 	y=visc1[ , 1], 
 	xtest=visc2[ ,-1],
 	ytest=visc2[ , 1],
 	keep.forest=TRUE,
 	importance=TRUE
 )
 print(visc.rf.split12)
 # NOTE(review): rsq() is not defined in this script; it has to be available
 # in the session. Presumably rsq(predicted, observed) -- confirm.
 rsq(predict(visc.rf.split12, visc2[ ,-1]), visc2[ ,1])
 # Importance plot saved to a PNG file.
 png("visc-split.png",w=600, h=600)
 impplot(visc.rf.split12, "%visceral, 14w/10j split")
 dev.off()

 # visceral/subc ratio
 # Train on the first MRI half, test on the second; the outcome is the first
 # column of viscsub1/viscsub2. The argument list no longer ends in a
 # dangling comma, which passed an extra empty argument.
 set.seed(16)
 viscsub.rf.split12 <- randomForest(
 	x=viscsub1[ ,-1], 
 	y=viscsub1[ , 1], 
 	xtest=viscsub2[ ,-1],
 	ytest=viscsub2[ , 1],
 	keep.forest=TRUE,
 	importance=TRUE
 )
 print(viscsub.rf.split12)
 # NOTE(review): rsq() is not defined in this script; it has to be available
 # in the session. Presumably rsq(predicted, observed) -- confirm.
 rsq(predict(viscsub.rf.split12, viscsub2[ ,-1]), viscsub2[ ,1])
 # Importance plot saved to a PNG file.
 png("viscsub-split.png",w=600, h=600)
 impplot(viscsub.rf.split12, "visceral/subratio, 14w/10j split")
 dev.off()