Skip to content

Instantly share code, notes, and snippets.

@crazyhottommy
Last active September 16, 2023 10:42
Show Gist options
  • Save crazyhottommy/f4ba546e0083abbc5f05 to your computer and use it in GitHub Desktop.
Save crazyhottommy/f4ba546e0083abbc5f05 to your computer and use it in GitHub Desktop.
## get all the promoter sequences for human hg19 genome
## Author: Ming Tang (Tommy)
## Date: 04/30/2015
## load the libraries
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg19)
BSgenome.Hsapiens.UCSC.hg19
# or
Hsapiens
class(Hsapiens)
##BSgenome contains the DNA sequences
#############
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
Txdb<- TxDb.Hsapiens.UCSC.hg19.knownGene
Txdb
class(Txdb)
## Genomic features, contains the gene structure information
## all the genes, returns a GRange object
hg19_genes<- genes(Txdb)
hg19_genes
## you can also get all the exons by gene, returns a GRangelist object
exbg<- exonsBy(Txdb, by="gene")
exbg
############### from another database
library(Homo.sapiens)
ghs<- genes(Homo.sapiens)
ghs
############# hg19_genes and ghs are the same, the ENTREZID is the geneID
## get all the transcription start sites (TSSs)
tssgr<- resize( ghs, 1)
## how many bases you want to extract upstream and downstream of every TSSs
## in this case, I want the 1500 bases around TSS
upstream_pad<- 1000
downstream_pad<- 500
promoters<- promoters(tssgr, upstream_pad, downstream_pad)
head(width(promoters))
## in fact, promoters(ghs, upstream_pad, downstream_pad) gives the same result
## because the default site for calcuation is the start(ghs), which is the TSS
### get sequences for all the promoters (1500bp sequences in this case)
## it takes seconds
promoter_seq<- getSeq(Hsapiens, promoters)
## we can annotate the promoter sequences with gene names, symbols in addition to the entrez id.
annotation<- select(Homo.sapiens, key=names(promoter_seq), keytype="ENTREZID",
columns=c("SYMBOL", "ENSEMBL", "ENTREZID"))
dim(annotation)
## more rows than the promoter_seq
## Be aware that there are some duplicated rows for the same ENTREZID, because there are multiple ENSEMBL id for the
## same ENTREZID
## Let's only use the SYMBOL instead
annotation<- select(Homo.sapiens, key=names(promoter_seq), keytype="ENTREZID",
columns=c("SYMBOL", "ENTREZID"))
## Now, it is the one to one mapping
## write to a fasta file
writeXStringSet(promoter_seq, "promoters_1500.fa", append=FALSE,
compress=FALSE,compression_level=NA, format="fasta")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment