Last active
March 23, 2017 15:23
-
-
Save davetang/6562474 to your computer and use it in GitHub Desktop.
From a data frame with chromosomal coordinates, obtain the sequence, and calculate the dinucleotide frequencies
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#I want to fetch sequences from | |
#my_random_loci and my_refseq_tss | |
head(my_random_loci,2) | |
chr start end strand | |
1 chr18 59415403 59415407 + | |
2 chr22 8535632 8535636 - | |
#install if necessary | |
source("http://bioconductor.org/biocLite.R") | |
biocLite("BSgenome.Hsapiens.UCSC.hg19") | |
#load if necessary | |
library(BSgenome.Hsapiens.UCSC.hg19) | |
#obtain the sequences | |
my_random_loci_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19, | |
names=my_random_loci$chr, | |
start=my_random_loci$start, | |
end=my_random_loci$end, | |
strand=my_random_loci$strand) | |
head(my_random_loci_seq) | |
# A DNAStringSet instance of length 6 | |
# width seq | |
#[1] 5 CTCAG | |
#[2] 5 NNNNN | |
#[3] 5 CCTCA | |
#[4] 5 AGAAA | |
#[5] 5 TAATT | |
#[6] 5 CCCTC | |
#I don't want entries with N's | |
#how many are there? | |
length(grep(pattern="N",x=my_random_loci_seq)) | |
#[1] 5150 | |
#remove them | |
my_random_loci_seq <- my_random_loci_seq[-grep(pattern="N",x=my_random_loci_seq)] | |
#do the same for my_refseq_tss | |
my_refseq_tss_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19, | |
names=my_refseq_tss$chromosome_name, | |
start=my_refseq_tss$transcript_start, | |
end=my_refseq_tss$transcript_end, | |
strand=my_refseq_tss$strand) | |
head(my_refseq_tss_seq) | |
# A DNAStringSet instance of length 6 | |
# width seq | |
#[1] 5 TTGTT | |
#[2] 5 GGGAT | |
#[3] 5 GGCAC | |
#[4] 5 CGCTG | |
#[5] 5 CAGCA | |
#[6] 5 GTGGC | |
#calculate the dinucleotide frequency | |
my_random_loci_seq_di <- dinucleotideFrequency(my_random_loci_seq) | |
colSums(my_random_loci_seq_di) | |
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC | |
#14142 7386 9968 11071 10667 7813 1401 10098 8640 6150 7471 7147 9097 8622 | |
# TG TT | |
#10424 13611 | |
my_refseq_tss_seq_di <- dinucleotideFrequency(my_refseq_tss_seq) | |
colSums(my_refseq_tss_seq_di) | |
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC | |
# 5868 5941 10382 5275 10694 11895 9787 8117 8649 13169 13423 6469 3493 7238 | |
# TG TT | |
# 8390 5546 | |
#how about the number of combinations? | |
my_random_loci_seq_di_comb <- table(apply(my_random_loci_seq_di, 1, function(x) paste(x, collapse=''))) | |
head(my_random_loci_seq_di_comb) | |
# | |
#0000000000000004 0000000000000013 0000000000000103 0000000000001003 0000000000010003 | |
# 234 74 81 110 70 | |
#0000000000010012 | |
# 231 | |
head(my_random_loci_seq_di_comb[order(my_random_loci_seq_di_comb, decreasing=T)]) | |
# | |
#2001000000001000 0001000000001002 2010000010000000 0000000100000102 2100100000000000 | |
# 395 393 371 341 287 | |
#1001000000001001 | |
# 275 | |
length(unique(my_random_loci_seq_di_comb)) | |
#[1] 158 | |
#combinations for RefSeq | |
my_refseq_tss_seq_di_comb <- table(apply(my_refseq_tss_seq_di, 1, function(x) paste(x, collapse=''))) | |
head(my_refseq_tss_seq_di_comb[order(my_refseq_tss_seq_di_comb, decreasing=T)]) | |
# | |
#0000001001200000 0000021001000000 0010000010200000 0000011001100000 0010110001000000 | |
# 627 400 397 363 338 | |
#0001110000000010 | |
# 334 | |
length(unique(my_refseq_tss_seq_di_comb)) | |
[1] 156 | |
#not run | |
#par(mfrow=c(2,1)) | |
#plot(density(my_refseq_tss_seq_di_comb)) | |
#plot(density(my_random_loci_seq_di_comb)) | |
#par(mfrow=c(1,1)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment