Skip to content

Instantly share code, notes, and snippets.

@davetang
Last active December 23, 2015 01:49
Show Gist options
  • Save davetang/6562844 to your computer and use it in GitHub Desktop.
Save davetang/6562844 to your computer and use it in GitHub Desktop.
From two sets of dinucleotide counts, use random forests to create a predictor
#install if necessary
install.packages("randomForest")
#load library
library(randomForest)
#I have two sets of dinucleotide counts stored in
#my_random_loci_seq_di and my_refseq_tss_seq_di
head(my_refseq_tss_seq_di,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
#[1,] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2
#[2,] 0 0 0 1 0 0 0 0 1 0 2 0 0 0 0 0
head(my_random_loci_seq_di,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
#[1,] 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0
#[2,] 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0
#sample a set for training from the refseq set
set.seed(23456)
my_refseq_sample_index <- sample(
x=1:nrow(my_refseq_tss_seq_di),
size=round(nrow(my_refseq_tss_seq_di)/2),
replace=F)
head(my_refseq_sample_index)
#[1] 15687 25731 23143 30129 26312 15763
#store the sample
my_refseq_sample <- as.data.frame(my_refseq_tss_seq_di[my_refseq_sample_index,])
#create tss class
my_refseq_sample$class <- rep('tss', round(nrow(my_refseq_tss_seq_di)/2))
head(my_refseq_sample,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
#1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss
#2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss
#sample a set for training from the random set
set.seed(23456)
my_random_sample_index <- sample(
x=1:nrow(my_random_loci_seq_di),
size=round(nrow(my_random_loci_seq_di)/2),
replace=F)
#store the sample
my_random_sample <- as.data.frame(my_random_loci_seq_di[my_random_sample_index,])
#create random class
my_random_sample$class <- rep('random',round(nrow(my_random_loci_seq_di)/2))
#combine the two samples
train <- rbind(my_refseq_sample,my_random_sample)
#set the class as a factor
train$class <- as.factor(train$class)
#check out train
dim(train)
#[1] 34756 17
head(train,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
#1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 tss
#2 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 tss
tail(train,2)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT class
#34755 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 random
#34756 0 0 0 0 0 0 0 0 0 1 0 1 0 0 2 0 random
#training
r <- randomForest(class ~ ., data=train, importance=TRUE, do.trace=100)
#ntree OOB 1 2
# 100: 30.64% 25.14% 36.52%
# 200: 30.51% 24.96% 36.46%
# 300: 30.40% 24.85% 36.33%
# 400: 30.47% 24.98% 36.34%
# 500: 30.47% 24.89% 36.44%
r
#Call:
# randomForest(formula = class ~ ., data = train, importance = TRUE, do.trace = 100)
# Type of random forest: classification
# Number of trees: 500
#No. of variables tried at each split: 4
#
# OOB estimate of error rate: 30.47%
#Confusion matrix:
# random tss class.error
#random 13493 4471 0.2488867
#tss 6119 10673 0.3643997
#install if necessary
install.packages('pROC')
#load library
library(pROC)
#check roc
roc(r$y, r$votes)
#Call:
#roc.default(response = r$y, predictor = r$votes)
#
#Data: r$votes in 35928 controls (r$y random) < 33584 cases (r$y tss).
#Area under the curve: 0.5
#check ci
ci.auc(r$y, r$votes)
#check importance
rn <- round(importance(r), 2)
rn[order(rn[,3], decreasing=TRUE),]
# random tss MeanDecreaseAccuracy MeanDecreaseGini
#CG 96.82 32.57 76.12 1143.27
#TG 15.52 43.58 69.24 186.87
#GG -0.22 51.59 61.87 211.40
#GC 39.09 24.54 59.69 391.97
#CT 19.01 28.02 58.98 127.87
#CA -27.72 51.29 58.50 152.47
#CC 6.32 45.02 57.33 191.00
#AC 8.66 33.77 55.31 102.66
#GT 13.95 28.26 55.19 111.89
#AT -16.42 55.89 54.25 206.15
#TC 23.27 17.44 53.59 105.38
#TA -16.51 58.66 53.50 261.05
#AA 23.41 34.41 52.59 311.58
#AG -14.48 38.80 51.53 119.44
#GA -14.70 36.49 46.55 84.82
#TT 12.32 38.70 43.29 274.56
#plot ROC
#install if necessary
install.packages('verification')
#load library
library(verification)
aucc <- roc.area(as.integer(train$class=='tss'), r$votes[,2])$A
roc.plot(as.integer(train$class=='tss'), r$votes[,2], main="")
legend("bottomright", bty="n", sprintf("Area Under the Curve (AUC) = %1.3f", aucc))
title(main="OOB ROC Curve Random Forest for predicting TSS")
#creating the test set
my_random_test <- as.data.frame(my_random_loci_seq_di[-my_random_sample_index,])
my_random_test$class <- rep('random',nrow(my_random_test))
my_refseq_test <- as.data.frame(my_refseq_tss_seq_di[-my_refseq_sample_index,])
my_refseq_test$class <- rep('tss',nrow(my_refseq_test))
test <- rbind(my_refseq_test,my_random_test)
test$class <- as.factor(test$class)
dim(test)
#[1] 34755 17
data.predict <- predict(r, test)
t = table(observed=test[,'class'], predict=data.predict)
prop.table(t,1)
# predict
#observed random tss
# random 0.7639592 0.2360408
# tss 0.3632087 0.6367913
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment