Last active
September 16, 2023 10:42
-
-
Save crazyhottommy/f4ba546e0083abbc5f05 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## get all the promoter sequences for human hg19 genome | |
## Author: Ming Tang (Tommy) | |
## Date: 04/30/2015 | |
## load the libraries | |
library(GenomicRanges) | |
library(BSgenome.Hsapiens.UCSC.hg19) | |
BSgenome.Hsapiens.UCSC.hg19 | |
# or | |
Hsapiens | |
class(Hsapiens) | |
##BSgenome contains the DNA sequences | |
############# | |
library(TxDb.Hsapiens.UCSC.hg19.knownGene) | |
Txdb<- TxDb.Hsapiens.UCSC.hg19.knownGene | |
Txdb | |
class(Txdb) | |
## Genomic features, contains the gene structure information | |
## all the genes, returns a GRange object | |
hg19_genes<- genes(Txdb) | |
hg19_genes | |
## you can also get all the exons by gene, returns a GRangelist object | |
exbg<- exonsBy(Txdb, by="gene") | |
exbg | |
############### from another database | |
library(Homo.sapiens) | |
ghs<- genes(Homo.sapiens) | |
ghs | |
############# hg19_genes and ghs are the same, the ENTREZID is the geneID | |
## get all the transcription start sites (TSSs) | |
tssgr<- resize( ghs, 1) | |
## how many bases you want to extract upstream and downstream of every TSSs | |
## in this case, I want the 1500 bases around TSS | |
upstream_pad<- 1000 | |
downstream_pad<- 500 | |
promoters<- promoters(tssgr, upstream_pad, downstream_pad) | |
head(width(promoters)) | |
## in fact, promoters(ghs, upstream_pad, downstream_pad) gives the same result | |
## because the default site for calcuation is the start(ghs), which is the TSS | |
### get sequences for all the promoters (1500bp sequences in this case) | |
## it takes seconds | |
promoter_seq<- getSeq(Hsapiens, promoters) | |
## we can annotate the promoter sequences with gene names, symbols in addition to the entrez id. | |
annotation<- select(Homo.sapiens, key=names(promoter_seq), keytype="ENTREZID", | |
columns=c("SYMBOL", "ENSEMBL", "ENTREZID")) | |
dim(annotation) | |
## more rows than the promoter_seq | |
## Be aware that there are some duplicated rows for the same ENTREZID, because there are multiple ENSEMBL id for the | |
## same ENTREZID | |
## Let's only use the SYMBOL instead | |
annotation<- select(Homo.sapiens, key=names(promoter_seq), keytype="ENTREZID", | |
columns=c("SYMBOL", "ENTREZID")) | |
## Now, it is the one to one mapping | |
## write to a fasta file | |
writeXStringSet(promoter_seq, "promoters_1500.fa", append=FALSE, | |
compress=FALSE,compression_level=NA, format="fasta") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment