Last active
September 29, 2017 19:14
-
-
Save bkutlu/3f95ee27942391119e0f651f2cfa63c8 to your computer and use it in GitHub Desktop.
Finding the longest peptide lengths genome wide
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### script to find the length of the largest peptide for all the genes | |
### example demo for Mus Musculus | |
## load the necessary packages | |
library(dplyr) | |
library(biomaRt) | |
library(org.Mm.eg.db) | |
library(Biostrings) | |
## connect to Biomart Ensembl database and select mmusculus dataset | |
ensembl <- useMart("ensembl") | |
ensembl <- useDataset("mmusculus_gene_ensembl",mart=ensembl) | |
## function that generates a data frame of ids for all the genes | |
loadEnsMusBioc <- function(){ | |
require('org.Mm.eg.db') | |
ensTab <- toTable(org.Mm.egENSEMBL) | |
symTab <- toTable(org.Mm.egSYMBOL) | |
ensMusEgSym <<- merge(ensTab,symTab) | |
print('Just base::loaded ensMusEgSym!') | |
}#loadEnsMusBioc | |
loadEnsMusBioc() | |
## get the peptide sequences from biomart | |
protein <- getSequence(id=ensMusEgSym$ensembl_id, | |
type="ensembl_gene_id", | |
seqType="peptide", | |
mart=ensembl) | |
## remove rows with unavailable sequences | |
protein %>% filter(peptide != "Sequence unavailable") -> proteinL | |
## add a column corresponding to the length of the peptide | |
proteinLW <- data.frame(proteinL, slength = width(aaS)) | |
## for multiple sequences select the longest peptide | |
proteinLW %>% | |
group_by(ensembl_gene_id) %>% | |
filter(rank(slength, ties.method = "first") == 1) -> proteinLWF | |
proteinLW %>% | |
group_by(ensembl_gene_id) %>% summarise(n()) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment