Last active
May 12, 2022 06:55
-
-
Save davetang/6562391 to your computer and use it in GitHub Desktop.
Using biomaRt to fetch all human mRNA refSeqs and their corresponding chromosome coordinates
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#install if necessary | |
source("http://bioconductor.org/biocLite.R") | |
biocLite("biomaRt") | |
#load library | |
library("biomaRt") | |
#use ensembl mart | |
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl") | |
#store the filters | |
filters <- listFilters(ensembl) | |
#look for filters with the string refseq | |
grep('refseq', filters$name, ignore.case=T, value=T) | |
#[1] "with_refseq_peptide" "with_refseq_peptide_predicted" "with_ox_refseq_mrna" | |
#[4] "with_ox_refseq_mrna_predicted" "with_ox_refseq_ncrna" "with_ox_refseq_ncrna_predicted" | |
#[7] "refseq_mrna" "refseq_mrna_predicted" "refseq_ncrna" | |
#[10] "refseq_ncrna_predicted" "refseq_peptide" "refseq_peptide_predicted" | |
#store the attributes we can fetch | |
attributes <- listAttributes(ensembl) | |
#check out the first 10 attributes | |
head(attributes,10) | |
# name description | |
#1 ensembl_gene_id Ensembl Gene ID | |
#2 ensembl_transcript_id Ensembl Transcript ID | |
#3 ensembl_peptide_id Ensembl Protein ID | |
#4 ensembl_exon_id Ensembl Exon ID | |
#5 description Description | |
#6 chromosome_name Chromosome Name | |
#7 start_position Gene Start (bp) | |
#8 end_position Gene End (bp) | |
#9 strand Strand | |
#10 band Band | |
#create vector of chromosomes | |
my_chr <- c(1:22,'X','Y') | |
#fetch refseqs | |
my_refseq <- getBM(attributes='refseq_mrna', | |
filters = 'chromosome_name', | |
values = my_chr, | |
mart = ensembl) | |
#how many entries 2013 November 28th | |
length(my_refseq) | |
[1] 35423 | |
#check out the first few | |
head(my_refseq) | |
[1] "NM_001084392" "NM_001355" "NR_036221" "NM_138450" "NM_022840" | |
[6] "NM_173505" | |
#I only want entries starting with a NM (curated mRNA) | |
my_refseq <- my_refseq[grep(pattern="^NM",x=my_refseq,perl=T)] | |
length(my_refseq) | |
[1] 33584 | |
#check to see if only NM | |
head(my_refseq) | |
[1] "NM_001084392" "NM_001355" "NM_138450" "NM_022840" "NM_173505" | |
[6] "NM_033453" | |
#build attribute vector | |
my_attribute <- c('refseq_mrna', | |
'chromosome_name', | |
'transcript_start', | |
'transcript_end', | |
'strand') | |
#fetch refseqs and their chromosomal locations | |
my_refseq_loci <- getBM(attributes=my_attribute, | |
filters = c('refseq_mrna', 'chromosome_name'), | |
values = list(refseq_mrna=my_refseq, chromosome_name=my_chr), | |
mart = ensembl) | |
dim(my_refseq_loci) | |
#[1] 33657 5 | |
#how many refseq ids are listed in multiple places? | |
table(duplicated(my_refseq_loci$refseq_mrna)) | |
#FALSE TRUE | |
#33584 73 | |
#get rid of the duplicated entry | |
my_refseq_loci <- my_refseq_loci[!duplicated(my_refseq_loci$refseq_mrna),] | |
dim(my_refseq_loci) | |
#[1] 33584 5 | |
#convert the strand into '-' and '+' | |
my_refseq_loci$strand <- gsub(pattern='-1', replacement='-', my_refseq_loci$strand) | |
my_refseq_loci$strand <- gsub(pattern='1', replacement='+', my_refseq_loci$strand) | |
#add a 'chr' into the chromosome_name | |
my_refseq_loci$chromosome_name <- gsub(pattern="^", | |
replacement='chr', | |
my_refseq_loci$chromosome_name) | |
#we now have a data frame of all human mRNA refseqs and their chromosomal locations | |
head(my_refseq_loci) | |
# refseq_mrna chromosome_name transcript_start transcript_end strand | |
#1 NM_001084392 chr22 24313554 24316773 - | |
#2 NM_001355 chr22 24313554 24322019 - | |
#3 NM_138450 chr13 50202435 50208008 + | |
#4 NM_022840 chr18 2538452 2571485 - | |
#5 NM_033453 chr20 3190006 3204516 + | |
#6 NM_001267623 chr20 3190171 3204516 + | |
#I want locations of the region spanning the start of a refSeq | |
span <- 2 | |
#store as another object | |
my_refseq_tss <- my_refseq_loci | |
#positive strand | |
#adjust the end position first, because we need the start position in our calculations | |
my_refseq_tss[my_refseq_tss$strand=='+','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']+span | |
my_refseq_tss[my_refseq_tss$strand=='+','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']-span | |
#negative strand | |
my_refseq_tss[my_refseq_tss$strand=='-','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']-span | |
my_refseq_tss[my_refseq_tss$strand=='-','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']+span | |
head(my_refseq_tss) | |
# refseq_mrna chromosome_name transcript_start transcript_end strand | |
#1 NM_001084392 chr22 24316771 24316775 - | |
#2 NM_001355 chr22 24322017 24322021 - | |
#3 NM_138450 chr13 50202433 50202437 + | |
#4 NM_022840 chr18 2571483 2571487 - | |
#5 NM_033453 chr20 3190004 3190008 + | |
#6 NM_001267623 chr20 3190169 3190173 + |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment