mdozmorov · September 19, 2022 19:59
diff --git a/gist_T2T_excluderanges.R b/gist_T2T_excluderanges.R
 # Download a list of problematic regions (aka blacklist) for the T2T-CHM13
 # telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist
 # software, High Signal and Low Mappability regions.
 # See https://github.com/dozmorovlab/excluderanges for more information.
 #
 # Steps: download the BED file, read it into a GRanges object, keep only the
 # chromosomes shared with the NCBI T2T-CHM13v2.0 chromosome table, and attach
 # seqinfo (lengths, circularity, genome name). The resulting
 # T2T.excluderanges object is printed at the end.

 suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
 suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
 # bedbase_id
 bedbase_id <- "6548a002754cc1e882035293541b59a8"
 # Construct output file name
 fileNameOut <- "T2T.excluderanges.bed.gz"
 # API URL for BED data
 token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
 # Download file; fail early on an HTTP error so that an error page written
 # to disk is not parsed as BED data further down
 response <- GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))
 stop_for_status(response)
 # Read the data in (six columns: character, double, double, character,
 # double, character)
 T2T.excluderanges <- readr::read_tsv(fileNameOut,
                                      col_names = FALSE,
                                      col_types = "cddcdc")
 # Assign column names depending on the number of columns
 all_columns <- c("chr", "start", "end", "name", "score", "strand",
                  "signalValue", "pValue", "qValue", "peak")
 colnames(T2T.excluderanges) <- all_columns[seq_len(ncol(T2T.excluderanges))]
 # Convert to GRanges object, keeping the non-coordinate columns as metadata
 T2T.excluderanges <- makeGRangesFromDataFrame(T2T.excluderanges,
                                               keep.extra.columns = TRUE)
 # Seqinfo for T2T genome
 chrom_data <- GenomeInfoDb::getChromInfoFromNCBI(assembly = "T2T-CHM13v2.0",
                                                  assembled.molecules.only = TRUE) # GCA_009914755.4
 # Prefix molecule names with "chr" to match the names used in the BED file
 # NOTE(review): if the mitochondrial molecule is reported as "MT", this yields
 # "chrMT"; should the BED file use "chrM", that chromosome is dropped by the
 # intersect below - confirm against the data.
 chrom_data$AssignedMolecule <- paste0("chr", chrom_data$AssignedMolecule)
 # Make the same format as UCSC chromosome data
 chrom_data <- data.frame(chrom = chrom_data$AssignedMolecule,
                          size = chrom_data$SequenceLength,
                          assembled = chrom_data$AssemblyUnit == "Primary Assembly",
                          circular = chrom_data$circular)
 # Keep standard chromosomes
 chromosomes_standard <- chrom_data$chrom
 # Subset and match to chromosomes in the T2T.excluderanges object
 # Common chromosomes
 chromosomes_common <- intersect(chrom_data$chrom, seqlevels(T2T.excluderanges))
 # Subset T2T.excluderanges
 T2T.excluderanges <- keepSeqlevels(T2T.excluderanges, chromosomes_common,
                                    pruning.mode = "tidy")
 # Subset chrom_data
 chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
 # Reorder chrom_data rows to follow the seqlevels order of the GRanges object
 chrom_data <- chrom_data[match(seqlevels(T2T.excluderanges), chrom_data$chrom), ]
 # Assign seqinfo data
 seqlengths(T2T.excluderanges) <- chrom_data$size
 # A chromosome is circular only when the circular column is TRUE; missing (NA)
 # values are treated as non-circular. Testing for NA alone would flag every
 # chromosome with a known FALSE value as circular.
 isCircular(T2T.excluderanges) <- !is.na(chrom_data$circular) & chrom_data$circular
 genome(T2T.excluderanges)     <- "T2T-CHM13v2.0" # "GCA_009914755.4"

 T2T.excluderanges
	# Download a list of problematic regions (aka blacklist) for the T2T-CHM13
	# telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist
	# software, High Signal and Low Mappability regions.
	# See https://github.com/dozmorovlab/excluderanges for more information.
	#
	# Steps: download the BED file, read it into a GRanges object, keep only the
	# chromosomes shared with the NCBI T2T-CHM13v2.0 chromosome table, and attach
	# seqinfo (lengths, circularity, genome name). The resulting
	# T2T.excluderanges object is printed at the end.

	suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
	suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
	# bedbase_id
	bedbase_id <- "6548a002754cc1e882035293541b59a8"
	# Construct output file name
	fileNameOut <- "T2T.excluderanges.bed.gz"
	# API URL for BED data
	token2 <- paste0("http://bedbase.org/api/bed/", bedbase_id, "/file/bed")
	# Download file; fail early on an HTTP error so that an error page written
	# to disk is not parsed as BED data further down
	response <- GET(url = token2, write_disk(fileNameOut, overwrite = TRUE))
	stop_for_status(response)
	# Read the data in (six columns: character, double, double, character,
	# double, character)
	T2T.excluderanges <- readr::read_tsv(fileNameOut,
	  col_names = FALSE,
	  col_types = "cddcdc")
	# Assign column names depending on the number of columns
	all_columns <- c("chr", "start", "end", "name", "score", "strand",
	  "signalValue", "pValue", "qValue", "peak")
	colnames(T2T.excluderanges) <- all_columns[seq_len(ncol(T2T.excluderanges))]
	# Convert to GRanges object, keeping the non-coordinate columns as metadata
	T2T.excluderanges <- makeGRangesFromDataFrame(T2T.excluderanges,
	  keep.extra.columns = TRUE)
	# Seqinfo for T2T genome
	chrom_data <- GenomeInfoDb::getChromInfoFromNCBI(assembly = "T2T-CHM13v2.0",
	  assembled.molecules.only = TRUE) # GCA_009914755.4
	# Prefix molecule names with "chr" to match the names used in the BED file
	# NOTE(review): if the mitochondrial molecule is reported as "MT", this yields
	# "chrMT"; should the BED file use "chrM", that chromosome is dropped by the
	# intersect below - confirm against the data.
	chrom_data$AssignedMolecule <- paste0("chr", chrom_data$AssignedMolecule)
	# Make the same format as UCSC chromosome data
	chrom_data <- data.frame(chrom = chrom_data$AssignedMolecule,
	  size = chrom_data$SequenceLength,
	  assembled = chrom_data$AssemblyUnit == "Primary Assembly",
	  circular = chrom_data$circular)
	# Keep standard chromosomes
	chromosomes_standard <- chrom_data$chrom
	# Subset and match to chromosomes in the T2T.excluderanges object
	# Common chromosomes
	chromosomes_common <- intersect(chrom_data$chrom, seqlevels(T2T.excluderanges))
	# Subset T2T.excluderanges
	T2T.excluderanges <- keepSeqlevels(T2T.excluderanges, chromosomes_common,
	  pruning.mode = "tidy")
	# Subset chrom_data
	chrom_data <- chrom_data[chrom_data$chrom %in% chromosomes_common, ]
	# Reorder chrom_data rows to follow the seqlevels order of the GRanges object
	chrom_data <- chrom_data[match(seqlevels(T2T.excluderanges), chrom_data$chrom), ]
	# Assign seqinfo data
	seqlengths(T2T.excluderanges) <- chrom_data$size
	# A chromosome is circular only when the circular column is TRUE; missing (NA)
	# values are treated as non-circular. Testing for NA alone would flag every
	# chromosome with a known FALSE value as circular.
	isCircular(T2T.excluderanges) <- !is.na(chrom_data$circular) & chrom_data$circular
	genome(T2T.excluderanges) <- "T2T-CHM13v2.0" # "GCA_009914755.4"

	T2T.excluderanges