Last active
April 17, 2018 20:56
-
-
Save davetang/6548010 to your computer and use it in GitHub Desktop.
Sampling random regions of the hg19 genome and obtaining the corresponding sequence
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#how many regions to sample? | |
number <- 50000 | |
#how many bp to add to start | |
span <- 4 | |
#some necessary packages | |
#install if necessary | |
source("http://bioconductor.org/biocLite.R") | |
biocLite("BSgenome") | |
#load library | |
library(BSgenome) | |
#find available genomes | |
available.genomes() | |
#I'm interested in hg19 | |
biocLite("BSgenome.Hsapiens.UCSC.hg19") | |
library(BSgenome.Hsapiens.UCSC.hg19) | |
#chromosomes of interest | |
my_chr <- c(1:22,'X','Y') | |
my_chr <- gsub(pattern="^", replacement='chr', my_chr) | |
#initialise list to store chromosome sizes | |
my_chr_size <- list() | |
for (i in my_chr){ | |
my_chr_size[[i]] <- length(BSgenome.Hsapiens.UCSC.hg19[[i]]) | |
} | |
#checkout my_chr_size | |
head(my_chr_size,2) | |
#$chr1 | |
#[1] 249250621 | |
# | |
#$chr2 | |
#[1] 243199373 | |
#initialise some vectors for storing random coordinates | |
my_random_start <- vector() | |
my_random_end <- vector() | |
my_random_chr <- vector() | |
my_random_strand <- vector() | |
set.seed(12345) | |
#loop through number of regions | |
for(i in 1:number){ | |
my_random_chr[i] <- sample(x=my_chr,size=1) | |
my_random_strand[i] <- sample(x=c('-','+'),size=1) | |
my_max <- my_chr_size[[my_random_chr[i]]]-span | |
my_random_start[i] <- runif(n=1, min=1, max=my_max) | |
my_random_end[i] <- my_random_start[i] + span | |
} | |
my_random_loci <- data.frame(chr=my_random_chr, | |
start=my_random_start, | |
end=my_random_end, | |
strand=my_random_strand) | |
head(my_random_loci) | |
# chr start end strand | |
#1 chr18 59415403 59415407 + | |
#2 chr22 8535632 8535636 - | |
#3 chr8 106509865 106509869 + | |
#4 chrY 9046958 9046962 - | |
#5 chr18 30544079 30544083 - | |
#6 chr12 53873398 53873402 - |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment