-
-
Save mdozmorov/33a1fa3234b2942dae238e1fcb39c996 to your computer and use it in GitHub Desktop.
# Download a list of problematic regions (aka blacklist) for the GRCm39/mm39 | |
# mouse genome assembly. Defined by the Boyle-Lab/Blacklist | |
# software, High Signal and Low Mappability regions. | |
# See https://github.com/dozmorovlab/excluderanges for more information. | |
suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr | |
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/ | |
# bedbase_id | |
bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5" | |
# Construct output file name | |
fileNameOut <- "mm39.excluderanges.bed.gz" | |
# API token for BED data | |
token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed") | |
# Download file | |
GET(url = token2, write_disk(fileNameOut, overwrite = TRUE)) | |
# Read the data in | |
mm39.excluderanges <- readr::read_tsv(fileNameOut, | |
col_names = FALSE, | |
col_types = c("cddcdc")) | |
# Assign column names depending on the number of columns | |
all_columns <- c("chr", "start", "end", "name", "score", "strand", | |
"signalValue", "pValue", "qValue", "peak") | |
colnames(mm39.excluderanges) <- all_columns[1:ncol(mm39.excluderanges)] | |
# Convert to GRanges object | |
mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges, | |
keep.extra.columns = TRUE) | |
# Seqinfo for mm39 genome | |
chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39", | |
assembled.molecules.only = TRUE) | |
# Subset and match to chromosomes in the mm39.excluderanges object | |
# Common chromosomes | |
chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges)) | |
# Subset mm39.excluderanges | |
mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common, | |
pruning.mode = "tidy") | |
# Subset chrom_data | |
chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ] | |
# Match objects | |
chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ] | |
# Assign seqinfo data | |
seqlengths(mm39.excluderanges) <- chrom_data$size | |
isCircular(mm39.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE) | |
genome(mm39.excluderanges) <- "mm39" | |
mm39.excluderanges |
Hi @kcoutinh, can you copy your complete code here? BedBase download also doesn't work, I wonder if you corrected the API.
So I actually downloaded the mm39 from the google drive provided in your lab's exclude ranges vignette: https://dozmorovlab.github.io/excluderanges/#excluderanges-genomic-ranges-of-problematic-genomic-regions
`# Construct output file name
fileNameOut <- "mm39.excluderanges.bed"
Read the data in
mm39.excluderanges <- readr::read_tsv(fileNameOut,
col_names = TRUE)
Assign column names depending on the number of columns
all_columns <- c("chr", "start", "end", "name", "strand","score",
"signalValue", "pValue", "qValue", "peak")
colnames(mm39.excluderanges) <- all_columns[1:ncol(mm39.excluderanges)]
Convert to GRanges object
mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges,
keep.extra.columns = TRUE)
Seqinfo for mm39 genome
chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39",
assembled.molecules.only = TRUE)
Subset and match to chromosomes in the mm39.excluderanges object
Common chromosomes
chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges))
Subset mm39.excluderanges
mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common,
pruning.mode = "tidy")
Subset chrom_data
chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
Match objects
chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ]
Assign seqinfo data
seqlengths(mm39.excluderanges) <- chrom_data$size
isCircular(mm39.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE)
genome(mm39.excluderanges) <- "mm39"
mm39.excluderanges
bed<- as.data.frame(mm39.excluderanges)
Rename columns
colnames(bed) <- c("chrom", "chromStart", "chromEnd", "width", "strand", "score", "name")
Reorder columns
bed_2 <- bed[, c("chrom", "chromStart", "chromEnd", "name", "score", "strand")]
write.table( bed, file = "mm39.excluderanges_2.bed",
sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)`
I believe, you got the file from Google Drive? mm39.excluderanges.bed. It doesn't have column names, so should be col_names = FALSE
. Here's the head:
chr1 5494200 5506700 12501 * High Signal Region
chr1 6063300 6069400 6101 * High Signal Region
chr1 8728600 8802500 73901 * High Signal Region
Opes my bad. Here is the edited code.
`suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
library(readr)
bedbase_id
bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
Construct output file name
fileNameOut <- "mm39.excluderanges.bed"
API token for BED data
#token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
Download file
#GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))
Read the data in
mm39.excluderanges <- readr::read_tsv(fileNameOut,
col_names = FALSE)
Assign column names depending on the number of columns
all_columns <- c("chr", "start", "end", "name", "strand","score",
"signalValue", "pValue", "qValue", "peak")
colnames(mm39.excluderanges) <- all_columns[1:ncol(mm39.excluderanges)]
Convert to GRanges object
mm39.excluderanges <- makeGRangesFromDataFrame(mm39.excluderanges,
keep.extra.columns = TRUE)
Seqinfo for mm39 genome
chrom_data <- GenomeInfoDb::getChromInfoFromUCSC(genome = "mm39",
assembled.molecules.only = TRUE)
Subset and match to chromosomes in the mm39.excluderanges object
Common chromosomes
chromosomes_common <- intersect(chrom_data$chrom, seqlevels(mm39.excluderanges))
Subset mm39.excluderanges
mm39.excluderanges <- keepSeqlevels(mm39.excluderanges, chromosomes_common,
pruning.mode = "tidy")
Subset chrom_data
chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
Match objects
chrom_data <- chrom_data[match(seqlevels(mm39.excluderanges), chrom_data$chrom), ]
Assign seqinfo data
seqlengths(mm39.excluderanges) <- chrom_data$size
isCircular(mm39.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE)
genome(mm39.excluderanges) <- "mm39"
mm39.excluderanges
bed<- as.data.frame(mm39.excluderanges)
Rename columns
colnames(bed) <- c("chrom", "chromStart", "chromEnd", "width", "strand", "score", "name")
Reorder columns
bed_2 <- bed[, c("chrom", "chromStart", "chromEnd", "name", "score", "strand")]
write.table( bed, file = "mm39.excluderanges_2.bed",
sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)`
Hi! I tried this code and I needed to make col_names=TRUE here:
mm39.excluderanges <- readr::read_tsv(fileNameOut, col_names = FALSE, col_types = c("cddcdc"))
I also needed to swap score and strand here:
all_columns <- c("chr", "start", "end", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak")