davetang · March 23, 2017 15:23
diff --git a/get_sequence.R b/get_sequence.R
 #I want to fetch sequences from
 #my_random_loci and my_refseq_tss
 head(my_random_loci,2)
    chr    start      end strand
 1 chr18 59415403 59415407      +
 2 chr22  8535632  8535636      -

 #install if necessary
 source("http://bioconductor.org/biocLite.R")
 biocLite("BSgenome.Hsapiens.UCSC.hg19")

 #load if necessary
 library(BSgenome.Hsapiens.UCSC.hg19)

 #obtain the sequences
 my_random_loci_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19,
                             names=my_random_loci$chr,
                             start=my_random_loci$start,
                             end=my_random_loci$end,
                             strand=my_random_loci$strand)

 head(my_random_loci_seq)
 #  A DNAStringSet instance of length 6
 #    width seq
 #[1]     5 CTCAG
 #[2]     5 NNNNN
 #[3]     5 CCTCA
 #[4]     5 AGAAA
 #[5]     5 TAATT
 #[6]     5 CCCTC

 #I don't want entries with N's
 #how many are there?
 length(grep(pattern="N",x=my_random_loci_seq))
 #[1] 5150

 #remove them
 my_random_loci_seq <- my_random_loci_seq[-grep(pattern="N",x=my_random_loci_seq)]

 #do the same for my_refseq_tss
 my_refseq_tss_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19,
                            names=my_refseq_tss$chromosome_name,
                            start=my_refseq_tss$transcript_start,
                            end=my_refseq_tss$transcript_end,
                            strand=my_refseq_tss$strand)

 head(my_refseq_tss_seq)
 #  A DNAStringSet instance of length 6
 #    width seq
 #[1]     5 TTGTT
 #[2]     5 GGGAT
 #[3]     5 GGCAC
 #[4]     5 CGCTG
 #[5]     5 CAGCA
 #[6]     5 GTGGC

 #calculate the dinucleotide frequency
 my_random_loci_seq_di <- dinucleotideFrequency(my_random_loci_seq)
 colSums(my_random_loci_seq_di)
 #   AA    AC    AG    AT    CA    CC    CG    CT    GA    GC    GG    GT    TA    TC 
 #14142  7386  9968 11071 10667  7813  1401 10098  8640  6150  7471  7147  9097  8622 
 #   TG    TT 
 #10424 13611

 my_refseq_tss_seq_di <- dinucleotideFrequency(my_refseq_tss_seq)
 colSums(my_refseq_tss_seq_di)
 #   AA    AC    AG    AT    CA    CC    CG    CT    GA    GC    GG    GT    TA    TC 
 # 5868  5941 10382  5275 10694 11895  9787  8117  8649 13169 13423  6469  3493  7238 
 #   TG    TT 
 # 8390  5546

 #how about the number of combinations?
 my_random_loci_seq_di_comb <- table(apply(my_random_loci_seq_di, 1, function(x) paste(x, collapse='')))
 head(my_random_loci_seq_di_comb)
 #
 #0000000000000004 0000000000000013 0000000000000103 0000000000001003 0000000000010003 
 #             234               74               81              110               70 
 #0000000000010012 
 #             231

 head(my_random_loci_seq_di_comb[order(my_random_loci_seq_di_comb, decreasing=T)])
 #
 #2001000000001000 0001000000001002 2010000010000000 0000000100000102 2100100000000000 
 #             395              393              371              341              287 
 #1001000000001001 
 #             275

 length(unique(my_random_loci_seq_di_comb))
 #[1] 158

 #combinations for RefSeq
 my_refseq_tss_seq_di_comb <- table(apply(my_refseq_tss_seq_di, 1, function(x) paste(x, collapse='')))
 head(my_refseq_tss_seq_di_comb[order(my_refseq_tss_seq_di_comb, decreasing=T)])
 #
 #0000001001200000 0000021001000000 0010000010200000 0000011001100000 0010110001000000 
 #             627              400              397              363              338 
 #0001110000000010 
 #             334

 length(unique(my_refseq_tss_seq_di_comb))
 [1] 156

 #not run
 #par(mfrow=c(2,1))
 #plot(density(my_refseq_tss_seq_di_comb))
 #plot(density(my_random_loci_seq_di_comb))
 #par(mfrow=c(1,1))
	#I want to fetch sequences from
	#my_random_loci and my_refseq_tss
	head(my_random_loci,2)
	chr start end strand
	1 chr18 59415403 59415407 +
	2 chr22 8535632 8535636 -

	#install if necessary
	source("http://bioconductor.org/biocLite.R")
	biocLite("BSgenome.Hsapiens.UCSC.hg19")

	#load if necessary
	library(BSgenome.Hsapiens.UCSC.hg19)

	#obtain the sequences
	my_random_loci_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19,
	names=my_random_loci$chr,
	start=my_random_loci$start,
	end=my_random_loci$end,
	strand=my_random_loci$strand)

	head(my_random_loci_seq)
	# A DNAStringSet instance of length 6
	# width seq
	#[1] 5 CTCAG
	#[2] 5 NNNNN
	#[3] 5 CCTCA
	#[4] 5 AGAAA
	#[5] 5 TAATT
	#[6] 5 CCCTC

	#I don't want entries with N's
	#how many are there?
	length(grep(pattern="N",x=my_random_loci_seq))
	#[1] 5150

	#remove them
	my_random_loci_seq <- my_random_loci_seq[-grep(pattern="N",x=my_random_loci_seq)]

	#do the same for my_refseq_tss
	my_refseq_tss_seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19,
	names=my_refseq_tss$chromosome_name,
	start=my_refseq_tss$transcript_start,
	end=my_refseq_tss$transcript_end,
	strand=my_refseq_tss$strand)

	head(my_refseq_tss_seq)
	# A DNAStringSet instance of length 6
	# width seq
	#[1] 5 TTGTT
	#[2] 5 GGGAT
	#[3] 5 GGCAC
	#[4] 5 CGCTG
	#[5] 5 CAGCA
	#[6] 5 GTGGC

	#calculate the dinucleotide frequency
	my_random_loci_seq_di <- dinucleotideFrequency(my_random_loci_seq)
	colSums(my_random_loci_seq_di)
	# AA AC AG AT CA CC CG CT GA GC GG GT TA TC
	#14142 7386 9968 11071 10667 7813 1401 10098 8640 6150 7471 7147 9097 8622
	# TG TT
	#10424 13611

	my_refseq_tss_seq_di <- dinucleotideFrequency(my_refseq_tss_seq)
	colSums(my_refseq_tss_seq_di)
	# AA AC AG AT CA CC CG CT GA GC GG GT TA TC
	# 5868 5941 10382 5275 10694 11895 9787 8117 8649 13169 13423 6469 3493 7238
	# TG TT
	# 8390 5546

	#how about the number of combinations?
	my_random_loci_seq_di_comb <- table(apply(my_random_loci_seq_di, 1, function(x) paste(x, collapse='')))
	head(my_random_loci_seq_di_comb)
	#
	#0000000000000004 0000000000000013 0000000000000103 0000000000001003 0000000000010003
	# 234 74 81 110 70
	#0000000000010012
	# 231

	head(my_random_loci_seq_di_comb[order(my_random_loci_seq_di_comb, decreasing=T)])
	#
	#2001000000001000 0001000000001002 2010000010000000 0000000100000102 2100100000000000
	# 395 393 371 341 287
	#1001000000001001
	# 275

	length(unique(my_random_loci_seq_di_comb))
	#[1] 158

	#combinations for RefSeq
	my_refseq_tss_seq_di_comb <- table(apply(my_refseq_tss_seq_di, 1, function(x) paste(x, collapse='')))
	head(my_refseq_tss_seq_di_comb[order(my_refseq_tss_seq_di_comb, decreasing=T)])
	#
	#0000001001200000 0000021001000000 0010000010200000 0000011001100000 0010110001000000
	# 627 400 397 363 338
	#0001110000000010
	# 334

	length(unique(my_refseq_tss_seq_di_comb))
	[1] 156

	#not run
	#par(mfrow=c(2,1))
	#plot(density(my_refseq_tss_seq_di_comb))
	#plot(density(my_random_loci_seq_di_comb))
	#par(mfrow=c(1,1))