Last active
December 23, 2015 01:49
-
-
Save davetang/6562844 to your computer and use it in GitHub Desktop.
From two sets of dinucleotide counts, use random forests to create a predictor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#install if necessary | |
install.packages("randomForest") | |
#load library | |
library(randomForest) | |
#I have two sets of dinucleotide counts stored in | |
#my_random_loci_seq_di and my_refseq_tss_seq_di | |
head(my_refseq_tss_seq_di,2) | |
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT | |
#[1,] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 | |
#[2,] 0 0 0 1 0 0 0 0 1 0 2 0 0 0 0 0 | |
head(my_random_loci_seq_di,2) | |
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT | |
#[1,] 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 | |
#[2,] 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 | |
#sample a set for training from the refseq set | |
set.seed(23456) | |
my_refseq_sample_index <- sample( | |
x=1:nrow(my_refseq_tss_seq_di), | |
size=round(nrow(my_refseq_tss_seq_di)/2), | |
replace=F) | |
head(my_refseq_sample_index) | |
#[1] 15687 25731 23143 30129 26312 15763 | |
#store the sample | |
my_refseq_sample <- as.data.frame(my_refseq_tss_seq_di[my_refseq_sample_index,]) | |
#create tss class | |
my_refseq_sample$class <- rep('tss', round(nrow(my_refseq_tss_seq_di)/2)) | |
head(my_refseq_sample,2) | |
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class | |
#1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss | |
#2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss | |
#sample a set for training from the random set | |
set.seed(23456) | |
my_random_sample_index <- sample( | |
x=1:nrow(my_random_loci_seq_di), | |
size=round(nrow(my_random_loci_seq_di)/2), | |
replace=F) | |
#store the sample | |
my_random_sample <- as.data.frame(my_random_loci_seq_di[my_random_sample_index,]) | |
#create random class | |
my_random_sample$class <- rep('random',round(nrow(my_random_loci_seq_di)/2)) | |
#combine the two samples | |
train <- rbind(my_refseq_sample,my_random_sample) | |
#set the class as a factor | |
train$class <- as.factor(train$class) | |
#check out train | |
dim(train) | |
#[1] 34756 17 | |
head(train,2) | |
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class | |
#1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss | |
#2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss | |
tail(train,2) | |
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class | |
#34755 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 random | |
#34756 0 0 0 0 0 0 0 0 0 1 0 1 0 0 2 0 random | |
#training | |
r <- randomForest(class ~ ., data=train, importance=TRUE, do.trace=100) | |
#ntree OOB 1 2 | |
# 100: 30.64% 25.14% 36.52% | |
# 200: 30.51% 24.96% 36.46% | |
# 300: 30.40% 24.85% 36.33% | |
# 400: 30.47% 24.98% 36.34% | |
# 500: 30.47% 24.89% 36.44% | |
r | |
#Call: | |
# randomForest(formula = class ~ ., data = train, importance = TRUE, do.trace = 100) | |
# Type of random forest: classification | |
# Number of trees: 500 | |
#No. of variables tried at each split: 4 | |
# | |
# OOB estimate of error rate: 30.47% | |
#Confusion matrix: | |
# random tss class.error | |
#random 13493 4471 0.2488867 | |
#tss 6119 10673 0.3643997 | |
#install if necessary | |
install.packages('pROC') | |
#load library | |
library(pROC) | |
#check roc | |
roc(r$y, r$votes) | |
#Call: | |
#roc.default(response = r$y, predictor = r$votes) | |
# | |
#Data: r$votes in 35928 controls (r$y random) < 33584 cases (r$y tss). | |
#Area under the curve: 0.5 | |
#check ci | |
ci.auc(r$y, r$votes) | |
#check importance | |
rn <- round(importance(r), 2) | |
rn[order(rn[,3], decreasing=TRUE),] | |
# random tss MeanDecreaseAccuracy MeanDecreaseGini | |
#CG 96.82 32.57 76.12 1143.27 | |
#TG 15.52 43.58 69.24 186.87 | |
#GG -0.22 51.59 61.87 211.40 | |
#GC 39.09 24.54 59.69 391.97 | |
#CT 19.01 28.02 58.98 127.87 | |
#CA -27.72 51.29 58.50 152.47 | |
#CC 6.32 45.02 57.33 191.00 | |
#AC 8.66 33.77 55.31 102.66 | |
#GT 13.95 28.26 55.19 111.89 | |
#AT -16.42 55.89 54.25 206.15 | |
#TC 23.27 17.44 53.59 105.38 | |
#TA -16.51 58.66 53.50 261.05 | |
#AA 23.41 34.41 52.59 311.58 | |
#AG -14.48 38.80 51.53 119.44 | |
#GA -14.70 36.49 46.55 84.82 | |
#TT 12.32 38.70 43.29 274.56 | |
#plot ROC | |
#install if necessary | |
install.packages('verification') | |
#load library | |
library(verification) | |
aucc <- roc.area(as.integer(train$class=='tss'), r$votes[,2])$A | |
roc.plot(as.integer(train$class=='tss'), r$votes[,2], main="") | |
legend("bottomright", bty="n", sprintf("Area Under the Curve (AUC) = %1.3f", aucc)) | |
title(main="OOB ROC Curve Random Forest for predicting TSS") | |
#creating the test set | |
my_random_test <- as.data.frame(my_random_loci_seq_di[-my_random_sample_index,]) | |
my_random_test$class <- rep('random',nrow(my_random_test)) | |
my_refseq_test <- as.data.frame(my_refseq_tss_seq_di[-my_refseq_sample_index,]) | |
my_refseq_test$class <- rep('tss',nrow(my_refseq_test)) | |
test <- rbind(my_refseq_test,my_random_test) | |
test$class <- as.factor(test$class) | |
dim(test) | |
#[1] 34755 17 | |
data.predict <- predict(r, test) | |
t = table(observed=test[,'class'], predict=data.predict) | |
prop.table(t,1) | |
# predict | |
#observed random tss | |
# random 0.7639592 0.2360408 | |
# tss 0.3632087 0.6367913 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment