Skip to content

Instantly share code, notes, and snippets.

@davetang
Last active April 17, 2018 20:56
Show Gist options
  • Save davetang/6548010 to your computer and use it in GitHub Desktop.
Save davetang/6548010 to your computer and use it in GitHub Desktop.
Sampling random regions of the hg19 genome and obtaining the corresponding sequence
#how many regions to sample?
number <- 50000
#how many bp to add to start
span <- 4
#some necessary packages
#install if necessary
source("http://bioconductor.org/biocLite.R")
biocLite("BSgenome")
#load library
library(BSgenome)
#find available genomes
available.genomes()
#I'm interested in hg19
biocLite("BSgenome.Hsapiens.UCSC.hg19")
library(BSgenome.Hsapiens.UCSC.hg19)
#chromosomes of interest
my_chr <- c(1:22,'X','Y')
my_chr <- gsub(pattern="^", replacement='chr', my_chr)
#initialise list to store chromosome sizes
my_chr_size <- list()
for (i in my_chr){
my_chr_size[[i]] <- length(BSgenome.Hsapiens.UCSC.hg19[[i]])
}
#checkout my_chr_size
head(my_chr_size,2)
#$chr1
#[1] 249250621
#
#$chr2
#[1] 243199373
#initialise some vectors for storing random coordinates
my_random_start <- vector()
my_random_end <- vector()
my_random_chr <- vector()
my_random_strand <- vector()
set.seed(12345)
#loop through number of regions
for(i in 1:number){
my_random_chr[i] <- sample(x=my_chr,size=1)
my_random_strand[i] <- sample(x=c('-','+'),size=1)
my_max <- my_chr_size[[my_random_chr[i]]]-span
my_random_start[i] <- runif(n=1, min=1, max=my_max)
my_random_end[i] <- my_random_start[i] + span
}
my_random_loci <- data.frame(chr=my_random_chr,
start=my_random_start,
end=my_random_end,
strand=my_random_strand)
head(my_random_loci)
# chr start end strand
#1 chr18 59415403 59415407 +
#2 chr22 8535632 8535636 -
#3 chr8 106509865 106509869 +
#4 chrY 9046958 9046962 -
#5 chr18 30544079 30544083 -
#6 chr12 53873398 53873402 -
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment