davetang · December 23, 2015 01:49
diff --git a/random_forest.R b/random_forest.R
 #install if necessary
 install.packages("randomForest")

 #load library
 library(randomForest)

 #I have two sets of dinucleotide counts stored in
 #my_random_loci_seq_di and my_refseq_tss_seq_di

 head(my_refseq_tss_seq_di,2)
 #     AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
 #[1,]  0  0  0  0  0  0  0  0  0  0  0  1  0  0  1  2
 #[2,]  0  0  0  1  0  0  0  0  1  0  2  0  0  0  0  0

 head(my_random_loci_seq_di,2)
 #     AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
 #[1,]  0  0  1  0  1  0  0  1  0  0  0  0  0  1  0  0
 #[2,]  0  0  0  0  1  1  0  1  0  0  0  0  0  1  0  0

 #sample a set for training from the refseq set
 set.seed(23456)
 my_refseq_sample_index <- sample(
   x=1:nrow(my_refseq_tss_seq_di),
   size=round(nrow(my_refseq_tss_seq_di)/2),
   replace=F)

 head(my_refseq_sample_index)
 #[1] 15687 25731 23143 30129 26312 15763

 #store the sample
 my_refseq_sample <- as.data.frame(my_refseq_tss_seq_di[my_refseq_sample_index,])

 #create tss class
 my_refseq_sample$class <- rep('tss', round(nrow(my_refseq_tss_seq_di)/2))

 head(my_refseq_sample,2)
 #  AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
 #1  0  0  0  0  0  0  1  1  0  1  0  0  0  1  0  0   tss
 #2  1  1  0  0  0  0  0  1  0  0  0  0  0  0  0  1   tss

 #sample a set for training from the random set
 set.seed(23456)
 my_random_sample_index <- sample(
   x=1:nrow(my_random_loci_seq_di),
   size=round(nrow(my_random_loci_seq_di)/2),
   replace=F)

 #store the sample
 my_random_sample <- as.data.frame(my_random_loci_seq_di[my_random_sample_index,])

 #create random class
 my_random_sample$class <- rep('random',round(nrow(my_random_loci_seq_di)/2))

 #combine the two samples
 train <- rbind(my_refseq_sample,my_random_sample)

 #set the class as a factor
 train$class <- as.factor(train$class)

 #check out train
 dim(train)
 #[1] 34756    17

 head(train,2)
 #  AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
 #1  0  0  0  0  0  0  1  1  0  1  0  0  0  1  0  0   tss
 #2  1  1  0  0  0  0  0  1  0  0  0  0  0  0  0  1   tss

 tail(train,2)
 #      AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT  class
 #34755  0  0  1  0  0  0  0  0  1  1  1  0  0  0  0  0 random
 #34756  0  0  0  0  0  0  0  0  0  1  0  1  0  0  2  0 random

 #training
 r <- randomForest(class ~ ., data=train, importance=TRUE, do.trace=100)
 #ntree      OOB      1      2
 #  100:  30.64% 25.14% 36.52%
 #  200:  30.51% 24.96% 36.46%
 #  300:  30.40% 24.85% 36.33%
 #  400:  30.47% 24.98% 36.34%
 #  500:  30.47% 24.89% 36.44%

 r

 #Call:
 # randomForest(formula = class ~ ., data = train, importance = TRUE,      do.trace = 100) 
 #               Type of random forest: classification
 #                     Number of trees: 500
 #No. of variables tried at each split: 4
 #
 #        OOB estimate of  error rate: 30.47%
 #Confusion matrix:
 #       random   tss class.error
 #random  13493  4471   0.2488867
 #tss      6119 10673   0.3643997

 #install if necessary
 install.packages('pROC')

 #load library
 library(pROC)

 #check roc
 roc(r$y, r$votes)

 #Call:
 #roc.default(response = r$y, predictor = r$votes)
 #
 #Data: r$votes in 35928 controls (r$y random) < 33584 cases (r$y tss).
 #Area under the curve: 0.5

 #check ci
 ci.auc(r$y, r$votes)

 #check importance
 rn <- round(importance(r), 2)
 rn[order(rn[,3], decreasing=TRUE),]
 #   random   tss MeanDecreaseAccuracy MeanDecreaseGini
 #CG  96.82 32.57                76.12          1143.27
 #TG  15.52 43.58                69.24           186.87
 #GG  -0.22 51.59                61.87           211.40
 #GC  39.09 24.54                59.69           391.97
 #CT  19.01 28.02                58.98           127.87
 #CA -27.72 51.29                58.50           152.47
 #CC   6.32 45.02                57.33           191.00
 #AC   8.66 33.77                55.31           102.66
 #GT  13.95 28.26                55.19           111.89
 #AT -16.42 55.89                54.25           206.15
 #TC  23.27 17.44                53.59           105.38
 #TA -16.51 58.66                53.50           261.05
 #AA  23.41 34.41                52.59           311.58
 #AG -14.48 38.80                51.53           119.44
 #GA -14.70 36.49                46.55            84.82
 #TT  12.32 38.70                43.29           274.56

 #plot ROC
 #install if necessary
 install.packages('verification')

 #load library
 library(verification)
 aucc <- roc.area(as.integer(train$class=='tss'), r$votes[,2])$A
 roc.plot(as.integer(train$class=='tss'), r$votes[,2], main="")
 legend("bottomright", bty="n", sprintf("Area Under the Curve (AUC) = %1.3f", aucc))
 title(main="OOB ROC Curve Random Forest for predicting TSS")

 #creating the test set
 my_random_test <- as.data.frame(my_random_loci_seq_di[-my_random_sample_index,])
 my_random_test$class <- rep('random',nrow(my_random_test))

 my_refseq_test <- as.data.frame(my_refseq_tss_seq_di[-my_refseq_sample_index,])
 my_refseq_test$class <- rep('tss',nrow(my_refseq_test))

 test <- rbind(my_refseq_test,my_random_test)
 test$class <- as.factor(test$class)
 dim(test)
 #[1] 34755    17

 data.predict <- predict(r, test)
 t = table(observed=test[,'class'], predict=data.predict)
 prop.table(t,1)
 #        predict
 #observed    random       tss
 #  random 0.7639592 0.2360408
 #  tss    0.3632087 0.6367913
	#install if necessary
	install.packages("randomForest")

	#load library
	library(randomForest)

	#I have two sets of dinucleotide counts stored in
	#my_random_loci_seq_di and my_refseq_tss_seq_di

	head(my_refseq_tss_seq_di,2)
	# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
	#[1,] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2
	#[2,] 0 0 0 1 0 0 0 0 1 0 2 0 0 0 0 0

	head(my_random_loci_seq_di,2)
	# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
	#[1,] 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0
	#[2,] 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0

	#sample a set for training from the refseq set
	set.seed(23456)
	my_refseq_sample_index <- sample(
	x=1:nrow(my_refseq_tss_seq_di),
	size=round(nrow(my_refseq_tss_seq_di)/2),
	replace=F)

	head(my_refseq_sample_index)
	#[1] 15687 25731 23143 30129 26312 15763

	#store the sample
	my_refseq_sample <- as.data.frame(my_refseq_tss_seq_di[my_refseq_sample_index,])

	#create tss class
	my_refseq_sample$class <- rep('tss', round(nrow(my_refseq_tss_seq_di)/2))

	head(my_refseq_sample,2)
	# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
	#1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss
	#2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss

	#sample a set for training from the random set
	set.seed(23456)
	my_random_sample_index <- sample(
	x=1:nrow(my_random_loci_seq_di),
	size=round(nrow(my_random_loci_seq_di)/2),
	replace=F)

	#store the sample
	my_random_sample <- as.data.frame(my_random_loci_seq_di[my_random_sample_index,])

	#create random class
	my_random_sample$class <- rep('random',round(nrow(my_random_loci_seq_di)/2))

	#combine the two samples
	train <- rbind(my_refseq_sample,my_random_sample)

	#set the class as a factor
	train$class <- as.factor(train$class)

	#check out train
	dim(train)
	#[1] 34756 17

	head(train,2)
	# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
	#1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss
	#2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss

	tail(train,2)
	# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
	#34755 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 random
	#34756 0 0 0 0 0 0 0 0 0 1 0 1 0 0 2 0 random

	#training
	r <- randomForest(class ~ ., data=train, importance=TRUE, do.trace=100)
	#ntree OOB 1 2
	# 100: 30.64% 25.14% 36.52%
	# 200: 30.51% 24.96% 36.46%
	# 300: 30.40% 24.85% 36.33%
	# 400: 30.47% 24.98% 36.34%
	# 500: 30.47% 24.89% 36.44%

	r

	#Call:
	# randomForest(formula = class ~ ., data = train, importance = TRUE, do.trace = 100)
	# Type of random forest: classification
	# Number of trees: 500
	#No. of variables tried at each split: 4
	#
	# OOB estimate of error rate: 30.47%
	#Confusion matrix:
	# random tss class.error
	#random 13493 4471 0.2488867
	#tss 6119 10673 0.3643997

	#install if necessary
	install.packages('pROC')

	#load library
	library(pROC)

	#check roc
	roc(r$y, r$votes)

	#Call:
	#roc.default(response = r$y, predictor = r$votes)
	#
	#Data: r$votes in 35928 controls (r$y random) < 33584 cases (r$y tss).
	#Area under the curve: 0.5

	#check ci
	ci.auc(r$y, r$votes)

	#check importance
	rn <- round(importance(r), 2)
	rn[order(rn[,3], decreasing=TRUE),]
	# random tss MeanDecreaseAccuracy MeanDecreaseGini
	#CG 96.82 32.57 76.12 1143.27
	#TG 15.52 43.58 69.24 186.87
	#GG -0.22 51.59 61.87 211.40
	#GC 39.09 24.54 59.69 391.97
	#CT 19.01 28.02 58.98 127.87
	#CA -27.72 51.29 58.50 152.47
	#CC 6.32 45.02 57.33 191.00
	#AC 8.66 33.77 55.31 102.66
	#GT 13.95 28.26 55.19 111.89
	#AT -16.42 55.89 54.25 206.15
	#TC 23.27 17.44 53.59 105.38
	#TA -16.51 58.66 53.50 261.05
	#AA 23.41 34.41 52.59 311.58
	#AG -14.48 38.80 51.53 119.44
	#GA -14.70 36.49 46.55 84.82
	#TT 12.32 38.70 43.29 274.56

	#plot ROC
	#install if necessary
	install.packages('verification')

	#load library
	library(verification)
	aucc <- roc.area(as.integer(train$class=='tss'), r$votes[,2])$A
	roc.plot(as.integer(train$class=='tss'), r$votes[,2], main="")
	legend("bottomright", bty="n", sprintf("Area Under the Curve (AUC) = %1.3f", aucc))
	title(main="OOB ROC Curve Random Forest for predicting TSS")

	#creating the test set
	my_random_test <- as.data.frame(my_random_loci_seq_di[-my_random_sample_index,])
	my_random_test$class <- rep('random',nrow(my_random_test))

	my_refseq_test <- as.data.frame(my_refseq_tss_seq_di[-my_refseq_sample_index,])
	my_refseq_test$class <- rep('tss',nrow(my_refseq_test))

	test <- rbind(my_refseq_test,my_random_test)
	test$class <- as.factor(test$class)
	dim(test)
	#[1] 34755 17

	data.predict <- predict(r, test)
	t = table(observed=test[,'class'], predict=data.predict)
	prop.table(t,1)
	# predict
	#observed random tss
	# random 0.7639592 0.2360408
	# tss 0.3632087 0.6367913