Last active
September 19, 2022 19:59
-
-
Save mdozmorov/7f4393f90932fb6bd911c43c20425ca0 to your computer and use it in GitHub Desktop.
T2T excluderanges download
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download a list of problematic regions (aka blacklist) for the T2T-CHM13 | |
# telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist | |
# software, High Signal and Low Mappability regions. | |
# See https://github.com/dozmorovlab/excluderanges for more information. | |
suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr | |
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/ | |
# bedbase_id | |
bedbase_id <- "6548a002754cc1e882035293541b59a8" | |
# Construct output file name | |
fileNameOut <- "T2T.excluderanges.bed.gz" | |
# API token for BED data | |
token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed") | |
# Download file | |
GET(url = token2, write_disk(fileNameOut, overwrite = TRUE)) | |
# Read the data in | |
T2T.excluderanges <- readr::read_tsv(fileNameOut, | |
col_names = FALSE, | |
col_types = c("cddcdc")) | |
# Assign column names depending on the number of columns | |
all_columns <- c("chr", "start", "end", "name", "score", "strand", | |
"signalValue", "pValue", "qValue", "peak") | |
colnames(T2T.excluderanges) <- all_columns[1:ncol(T2T.excluderanges)] | |
# Convert to GRanges object | |
T2T.excluderanges <- makeGRangesFromDataFrame(T2T.excluderanges, | |
keep.extra.columns = TRUE) | |
# Seqinfo for T2T genome | |
chrom_data <- GenomeInfoDb::getChromInfoFromNCBI(assembly = "T2T-CHM13v2.0", | |
assembled.molecules.only = TRUE) # GCA_009914755.4 | |
chrom_data$AssignedMolecule <- as.character(paste0("chr", chrom_data$AssignedMolecule)) | |
# Make the same format as UCSC chromosome data | |
chrom_data <- data.frame(chrom = chrom_data$AssignedMolecule, | |
size = chrom_data$SequenceLength, | |
assembled = ifelse(chrom_data$AssemblyUnit == "Primary Assembly", TRUE, FALSE), | |
circular = chrom_data$circular) | |
# Keep standard chromosomes | |
chromosomes_standard <- chrom_data$chrom | |
# Subset and match to chromosomes in the T2T.excluderanges object | |
# Common chromosomes | |
chromosomes_common <- intersect(chrom_data$chrom, seqlevels(T2T.excluderanges)) | |
# Subset T2T.excluderanges | |
T2T.excluderanges <- keepSeqlevels(T2T.excluderanges, chromosomes_common, | |
pruning.mode = "tidy") | |
# Subset chrom_data | |
chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ] | |
# Match objects | |
chrom_data <- chrom_data[match(seqlevels(T2T.excluderanges), chrom_data$chrom), ] | |
# Assign seqinfo data | |
seqlengths(T2T.excluderanges) <- chrom_data$size | |
isCircular(T2T.excluderanges) <- ifelse(is.na(chrom_data$circular), FALSE, TRUE) | |
genome(T2T.excluderanges) <- "T2T-CHM13v2.0" # "GCA_009914755.4" | |
T2T.excluderanges |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment