Skip to content

Instantly share code, notes, and snippets.

@gungorbudak
Created December 27, 2019 12:18
Show Gist options
  • Save gungorbudak/5f8a88c4b74a14fa034b91bd74212bb2 to your computer and use it in GitHub Desktop.
Save gungorbudak/5f8a88c4b74a14fa034b91bd74212bb2 to your computer and use it in GitHub Desktop.
Utility function to read and parse GMT file provided by Pathway Commons into R
library(stringr)
read.pc2.gmt <- function(file, organism = 9606, datasource = "kegg",
                         idtype = "hgnc symbol") {
  # Read a GMT file provided by Pathway Commons and return the gene sets that
  # match the requested organism, data source and identifier type.
  # i.e. https://www.pathwaycommons.org/archives/PC2/v12/PathwayCommons12.All.hgnc.gmt.gz
  #
  # Every line is split on tabs. The first field is ignored, the second field
  # is parsed as a description of the form
  #   "name: <..>; datasource: <..>; organism: <..>; idtype: <..>"
  # (keys in exactly this order, separated by "; "), and all remaining fields
  # are taken as the gene identifiers of the set.
  #
  # Args:
  #   file:       description passed to base::file() and opened for reading.
  #   organism:   value the "organism" descriptor must equal (compared as
  #               text, so 9606 and "9606" are equivalent).
  #   datasource: value the "datasource" descriptor must equal.
  #   idtype:     value the "idtype" descriptor must equal.
  #
  # Returns:
  #   A named list with one character vector of gene identifiers per matching
  #   line, named by the "name" descriptor. When several matching lines share
  #   a name, the last one wins.
  desc_keys <- c("name", "datasource", "organism", "idtype")
  key_labels <- paste0(desc_keys, ": ")

  # Render numeric ids without scientific notation so that e.g. 100000 is
  # compared as "100000" rather than "1e+05".
  organism_id <- if (is.numeric(organism)) {
    format(organism, scientific = FALSE, trim = TRUE)
  } else {
    as.character(organism)
  }

  con <- file(file, open = "r")
  # Close the connection even if reading or parsing fails.
  on.exit(close(con), add = TRUE)

  gmt <- list()
  for (line in readLines(con)) {
    line_parts <- unlist(strsplit(line, "\t", fixed = TRUE))
    if (length(line_parts) < 2L) {
      # Blank line or line without a description field: nothing to parse.
      next
    }
    desc <- line_parts[2]

    # Position of the first occurrence of each "<key>: " label (-1 if absent).
    label_start <- vapply(
      key_labels,
      function(label) as.integer(regexpr(label, desc, fixed = TRUE)),
      integer(1),
      USE.NAMES = FALSE
    )
    if (any(label_start < 0L)) {
      # A descriptor is missing, so the line cannot match the filter.
      next
    }

    # A value starts right after its label and ends just before the "; " that
    # precedes the next label (hence -3); the last value runs to the end.
    value_start <- label_start + nchar(key_labels)
    value_stop <- c(label_start[-1] - 3L, nchar(desc))
    desc_list <- as.list(substring(desc, value_start, value_stop))
    names(desc_list) <- desc_keys

    is_match <- desc_list$organism == organism_id &&
      desc_list$datasource == datasource &&
      desc_list$idtype == idtype
    if (is_match) {
      # Drop the first two fields; a line without genes gives character(0)
      # instead of the backwards sequence 3:2.
      genes <- if (length(line_parts) > 2L) {
        line_parts[-(1:2)]
      } else {
        character(0)
      }
      gmt[[desc_list$name]] <- genes
    }
  }

  gmt
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment