bkutlu · September 29, 2017 19:14
diff --git a/findLongestPeptide.R b/findLongestPeptide.R
 ### script to find the length of the longest peptide for each gene
 ### example demo for Mus musculus

 ## load the necessary packages
 library(dplyr)
 library(biomaRt)
 library(org.Mm.eg.db)
 library(Biostrings)

 ## open the Ensembl BioMart and point it at the mouse gene dataset
 ensembl <- useDataset(dataset = "mmusculus_gene_ensembl",
                       mart = useMart(biomart = "ensembl"))

 ## function that generates a data frame of ids for all the genes
 ##
 ## Converts the org.Mm.egENSEMBL and org.Mm.egSYMBOL maps to tables,
 ## merges them on the column(s) they share, and stores the result in the
 ## variable `ensMusEgSym` via `<<-` (kept because the rest of the script
 ## reads that variable directly).
 ##
 ## Takes no arguments. Prints a status line and returns the merged data
 ## frame invisibly. Stops with an error if org.Mm.eg.db is not installed.
 loadEnsMusBioc <- function(){
   ## library() fails loudly when the package is missing, whereas require()
   ## only returns FALSE and lets toTable() fail later with an unrelated error
   library(org.Mm.eg.db)
   ensTab <- toTable(org.Mm.egENSEMBL)
   symTab <- toTable(org.Mm.egSYMBOL)
   ensMusEgSym <<- merge(ensTab, symTab)
   print('Just base::loaded ensMusEgSym!')
   ## hand the table back as well, so callers need not rely on the global
   invisible(ensMusEgSym)
 }#loadEnsMusBioc

 loadEnsMusBioc()

 ## get the peptide sequences from biomart
 ## the merged id table may list the same Ensembl id on several rows, so
 ## each id is sent to the server only once
 protein <- getSequence(id = unique(ensMusEgSym$ensembl_id),
                        type = "ensembl_gene_id",
                        seqType = "peptide",
                        mart = ensembl)

 ## remove rows with unavailable sequences
 proteinL <- protein %>%
   filter(peptide != "Sequence unavailable")

 ## add a column corresponding to the length of the peptide
 ## `aaS` was referenced but never created; build it from the filtered
 ## peptide strings so width() measures exactly the rows kept above
 ## NOTE(review): width() counts every character of the string, including
 ## any trailing stop symbol the server may append -- confirm that is wanted
 aaS <- AAStringSet(as.character(proteinL$peptide))
 proteinLW <- data.frame(proteinL, slength = width(aaS))

 ## for multiple sequences select the longest peptide
 ## rank() is ascending, so rank 1 on the raw lengths would keep the
 ## shortest peptide; ranking desc(slength) makes rank 1 the longest.
 ## ties.method = "first" keeps exactly one row per gene.
 proteinLWF <- proteinLW %>%
   group_by(ensembl_gene_id) %>%
   filter(rank(desc(slength), ties.method = "first") == 1) %>%
   ungroup()  # drop the grouping so later steps are not silently per-gene


 ## show how many peptide rows each gene has
 summarise(group_by(proteinLW, ensembl_gene_id), n())
	### script to find the length of the longest peptide for each gene
	### example demo for Mus musculus

	## load the necessary packages
	library(dplyr)
	library(biomaRt)
	library(org.Mm.eg.db)
	library(Biostrings)

	## open the Ensembl BioMart and point it at the mouse gene dataset
	ensembl <- useDataset(dataset = "mmusculus_gene_ensembl",
	                      mart = useMart(biomart = "ensembl"))

	## function that generates a data frame of ids for all the genes
	##
	## Converts the org.Mm.egENSEMBL and org.Mm.egSYMBOL maps to tables,
	## merges them on the column(s) they share, and stores the result in the
	## variable `ensMusEgSym` via `<<-` (kept because the rest of the script
	## reads that variable directly).
	##
	## Takes no arguments. Prints a status line and returns the merged data
	## frame invisibly. Stops with an error if org.Mm.eg.db is not installed.
	loadEnsMusBioc <- function(){
	  ## library() fails loudly when the package is missing, whereas require()
	  ## only returns FALSE and lets toTable() fail later with an unrelated error
	  library(org.Mm.eg.db)
	  ensTab <- toTable(org.Mm.egENSEMBL)
	  symTab <- toTable(org.Mm.egSYMBOL)
	  ensMusEgSym <<- merge(ensTab, symTab)
	  print('Just base::loaded ensMusEgSym!')
	  ## hand the table back as well, so callers need not rely on the global
	  invisible(ensMusEgSym)
	}#loadEnsMusBioc

	loadEnsMusBioc()

	## get the peptide sequences from biomart
	## the merged id table may list the same Ensembl id on several rows, so
	## each id is sent to the server only once
	protein <- getSequence(id = unique(ensMusEgSym$ensembl_id),
	                       type = "ensembl_gene_id",
	                       seqType = "peptide",
	                       mart = ensembl)

	## remove rows with unavailable sequences
	proteinL <- protein %>%
	  filter(peptide != "Sequence unavailable")

	## add a column corresponding to the length of the peptide
	## `aaS` was referenced but never created; build it from the filtered
	## peptide strings so width() measures exactly the rows kept above
	## NOTE(review): width() counts every character of the string, including
	## any trailing stop symbol the server may append -- confirm that is wanted
	aaS <- AAStringSet(as.character(proteinL$peptide))
	proteinLW <- data.frame(proteinL, slength = width(aaS))

	## for multiple sequences select the longest peptide
	## rank() is ascending, so rank 1 on the raw lengths would keep the
	## shortest peptide; ranking desc(slength) makes rank 1 the longest.
	## ties.method = "first" keeps exactly one row per gene.
	proteinLWF <- proteinLW %>%
	  group_by(ensembl_gene_id) %>%
	  filter(rank(desc(slength), ties.method = "first") == 1) %>%
	  ungroup()  # drop the grouping so later steps are not silently per-gene


	## show how many peptide rows each gene has
	summarise(group_by(proteinLW, ensembl_gene_id), n())