davetang · May 12, 2022 06:55
diff --git a/biomart_refseq.R b/biomart_refseq.R
 #install if necessary
 source("http://bioconductor.org/biocLite.R")
 biocLite("biomaRt")

 #load library
 library("biomaRt")

 #use ensembl mart
 ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")

 #store the filters
 filters <- listFilters(ensembl)

 #look for filters with the string refseq
 grep('refseq', filters$name, ignore.case=T, value=T)
 #[1] "with_refseq_peptide"            "with_refseq_peptide_predicted"  "with_ox_refseq_mrna"           
 #[4] "with_ox_refseq_mrna_predicted"  "with_ox_refseq_ncrna"           "with_ox_refseq_ncrna_predicted"
 #[7] "refseq_mrna"                    "refseq_mrna_predicted"          "refseq_ncrna"                  
 #[10] "refseq_ncrna_predicted"         "refseq_peptide"                 "refseq_peptide_predicted"

 #store the attributes we can fetch
 attributes <- listAttributes(ensembl)

 #check out the first 10 attributes
 head(attributes,10)
 #                    name           description
 #1        ensembl_gene_id       Ensembl Gene ID
 #2  ensembl_transcript_id Ensembl Transcript ID
 #3     ensembl_peptide_id    Ensembl Protein ID
 #4        ensembl_exon_id       Ensembl Exon ID
 #5            description           Description
 #6        chromosome_name       Chromosome Name
 #7         start_position       Gene Start (bp)
 #8           end_position         Gene End (bp)
 #9                 strand                Strand
 #10                  band                  Band

 #create vector of chromosomes
 my_chr <- c(1:22,'X','Y')

 #fetch refseqs
 my_refseq <- getBM(attributes='refseq_mrna',
                   filters = 'chromosome_name',
                   values = my_chr,
                   mart = ensembl)

 #how many entries 2013 November 28th
 length(my_refseq)
 [1] 35423

 #check out the first few
 head(my_refseq)
 [1] "NM_001084392" "NM_001355"    "NR_036221"    "NM_138450"    "NM_022840"   
 [6] "NM_173505"

 #I only want entries starting with a NM (curated mRNA)
 my_refseq <- my_refseq[grep(pattern="^NM",x=my_refseq,perl=T)]
 length(my_refseq)
 [1] 33584

 #check to see if only NM
 head(my_refseq)
 [1] "NM_001084392" "NM_001355"    "NM_138450"    "NM_022840"    "NM_173505"   
 [6] "NM_033453"

 #build attribute vector
 my_attribute <- c('refseq_mrna',
                  'chromosome_name',
                  'transcript_start',
                  'transcript_end',
                  'strand')

 #fetch refseqs and their chromosomal locations
 my_refseq_loci <- getBM(attributes=my_attribute,
                        filters = c('refseq_mrna', 'chromosome_name'),
                        values = list(refseq_mrna=my_refseq, chromosome_name=my_chr),
                        mart = ensembl)
                        
 dim(my_refseq_loci)
 #[1] 33657     5

 #how many refseq ids are listed in multiple places?
 table(duplicated(my_refseq_loci$refseq_mrna))

 #FALSE  TRUE 
 #33584    73

 #get rid of the duplicated entry
 my_refseq_loci <- my_refseq_loci[!duplicated(my_refseq_loci$refseq_mrna),]
 dim(my_refseq_loci)
 #[1] 33584     5

 #convert the strand into '-' and '+'
 my_refseq_loci$strand <- gsub(pattern='-1', replacement='-', my_refseq_loci$strand)
 my_refseq_loci$strand <- gsub(pattern='1', replacement='+', my_refseq_loci$strand)

 #add a 'chr' into the chromosome_name
 my_refseq_loci$chromosome_name <- gsub(pattern="^",
                                       replacement='chr',
                                       my_refseq_loci$chromosome_name)

 #we now have a data frame of all human mRNA refseqs and their chromosomal locations
 head(my_refseq_loci)
 #   refseq_mrna chromosome_name transcript_start transcript_end strand
 #1 NM_001084392           chr22         24313554       24316773      -
 #2    NM_001355           chr22         24313554       24322019      -
 #3    NM_138450           chr13         50202435       50208008      +
 #4    NM_022840           chr18          2538452        2571485      -
 #5    NM_033453           chr20          3190006        3204516      +
 #6 NM_001267623           chr20          3190171        3204516      +

 #I want locations of the region spanning the start of a refSeq
 span <- 2

 #store as another object
 my_refseq_tss <- my_refseq_loci

 #positive strand
 #adjust the end position first, because we need the start position in our calculations
 my_refseq_tss[my_refseq_tss$strand=='+','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']+span
 my_refseq_tss[my_refseq_tss$strand=='+','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']-span

 #negative strand
 my_refseq_tss[my_refseq_tss$strand=='-','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']-span
 my_refseq_tss[my_refseq_tss$strand=='-','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']+span

 head(my_refseq_tss)
 #   refseq_mrna chromosome_name transcript_start transcript_end strand
 #1 NM_001084392           chr22         24316771       24316775      -
 #2    NM_001355           chr22         24322017       24322021      -
 #3    NM_138450           chr13         50202433       50202437      +
 #4    NM_022840           chr18          2571483        2571487      -
 #5    NM_033453           chr20          3190004        3190008      +
 #6 NM_001267623           chr20          3190169        3190173      +
	#install if necessary
	source("http://bioconductor.org/biocLite.R")
	biocLite("biomaRt")

	#load library
	library("biomaRt")

	#use ensembl mart
	ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")

	#store the filters
	filters <- listFilters(ensembl)

	#look for filters with the string refseq
	grep('refseq', filters$name, ignore.case=T, value=T)
	#[1] "with_refseq_peptide" "with_refseq_peptide_predicted" "with_ox_refseq_mrna"
	#[4] "with_ox_refseq_mrna_predicted" "with_ox_refseq_ncrna" "with_ox_refseq_ncrna_predicted"
	#[7] "refseq_mrna" "refseq_mrna_predicted" "refseq_ncrna"
	#[10] "refseq_ncrna_predicted" "refseq_peptide" "refseq_peptide_predicted"

	#store the attributes we can fetch
	attributes <- listAttributes(ensembl)

	#check out the first 10 attributes
	head(attributes,10)
	# name description
	#1 ensembl_gene_id Ensembl Gene ID
	#2 ensembl_transcript_id Ensembl Transcript ID
	#3 ensembl_peptide_id Ensembl Protein ID
	#4 ensembl_exon_id Ensembl Exon ID
	#5 description Description
	#6 chromosome_name Chromosome Name
	#7 start_position Gene Start (bp)
	#8 end_position Gene End (bp)
	#9 strand Strand
	#10 band Band

	#create vector of chromosomes
	my_chr <- c(1:22,'X','Y')

	#fetch refseqs
	my_refseq <- getBM(attributes='refseq_mrna',
	filters = 'chromosome_name',
	values = my_chr,
	mart = ensembl)

	#how many entries 2013 November 28th
	length(my_refseq)
	[1] 35423

	#check out the first few
	head(my_refseq)
	[1] "NM_001084392" "NM_001355" "NR_036221" "NM_138450" "NM_022840"
	[6] "NM_173505"

	#I only want entries starting with a NM (curated mRNA)
	my_refseq <- my_refseq[grep(pattern="^NM",x=my_refseq,perl=T)]
	length(my_refseq)
	[1] 33584

	#check to see if only NM
	head(my_refseq)
	[1] "NM_001084392" "NM_001355" "NM_138450" "NM_022840" "NM_173505"
	[6] "NM_033453"

	#build attribute vector
	my_attribute <- c('refseq_mrna',
	'chromosome_name',
	'transcript_start',
	'transcript_end',
	'strand')

	#fetch refseqs and their chromosomal locations
	my_refseq_loci <- getBM(attributes=my_attribute,
	filters = c('refseq_mrna', 'chromosome_name'),
	values = list(refseq_mrna=my_refseq, chromosome_name=my_chr),
	mart = ensembl)

	dim(my_refseq_loci)
	#[1] 33657 5

	#how many refseq ids are listed in multiple places?
	table(duplicated(my_refseq_loci$refseq_mrna))

	#FALSE TRUE
	#33584 73

	#get rid of the duplicated entry
	my_refseq_loci <- my_refseq_loci[!duplicated(my_refseq_loci$refseq_mrna),]
	dim(my_refseq_loci)
	#[1] 33584 5

	#convert the strand into '-' and '+'
	my_refseq_loci$strand <- gsub(pattern='-1', replacement='-', my_refseq_loci$strand)
	my_refseq_loci$strand <- gsub(pattern='1', replacement='+', my_refseq_loci$strand)

	#add a 'chr' into the chromosome_name
	my_refseq_loci$chromosome_name <- gsub(pattern="^",
	replacement='chr',
	my_refseq_loci$chromosome_name)

	#we now have a data frame of all human mRNA refseqs and their chromosomal locations
	head(my_refseq_loci)
	# refseq_mrna chromosome_name transcript_start transcript_end strand
	#1 NM_001084392 chr22 24313554 24316773 -
	#2 NM_001355 chr22 24313554 24322019 -
	#3 NM_138450 chr13 50202435 50208008 +
	#4 NM_022840 chr18 2538452 2571485 -
	#5 NM_033453 chr20 3190006 3204516 +
	#6 NM_001267623 chr20 3190171 3204516 +

	#I want locations of the region spanning the start of a refSeq
	span <- 2

	#store as another object
	my_refseq_tss <- my_refseq_loci

	#positive strand
	#adjust the end position first, because we need the start position in our calculations
	my_refseq_tss[my_refseq_tss$strand=='+','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']+span
	my_refseq_tss[my_refseq_tss$strand=='+','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='+','transcript_start']-span

	#negative strand
	my_refseq_tss[my_refseq_tss$strand=='-','transcript_start'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']-span
	my_refseq_tss[my_refseq_tss$strand=='-','transcript_end'] <- my_refseq_tss[my_refseq_tss$strand=='-','transcript_end']+span

	head(my_refseq_tss)
	# refseq_mrna chromosome_name transcript_start transcript_end strand
	#1 NM_001084392 chr22 24316771 24316775 -
	#2 NM_001355 chr22 24322017 24322021 -
	#3 NM_138450 chr13 50202433 50202437 +
	#4 NM_022840 chr18 2571483 2571487 -
	#5 NM_033453 chr20 3190004 3190008 +
	#6 NM_001267623 chr20 3190169 3190173 +