Created
July 11, 2013 17:26
-
-
Save cknoxrun/5977428 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'nokogiri' | |
| require 'open-uri' | |
# Helpers that download a UniProt entry as XML and pull annotation fields out
# of it with XPath queries.
module Uniprot
  # Raised when an entry holds data this module cannot interpret
  # (currently: a GO term whose one-letter prefix is not F, P or C).
  class Error < StandardError; end

  # Maps the prefix of a GO "term" property value (the part before the first
  # colon) to the category name stored in the results.
  GO_CATEGORIES = { 'F' => 'function', 'P' => 'process', 'C' => 'component' }.freeze
  private_constant :GO_CATEGORIES

  # Download the entry for the given uniprot_id and return its protein name.
  #
  # The recommended full name is preferred; the submitted full name is the
  # fallback.
  #
  # @param uniprot_id [String] accession interpolated into the entry URL
  # @return [String, nil] the name text, or nil when the entry has neither a
  #   recommended nor a submitted full name
  def self.annotate_name(uniprot_id)
    primary_name(fetch_document(uniprot_id))&.inner_text
  end

  # Returns a hash of results required to annotate a molecule with data from
  # UniProt.
  #
  # Always-present keys: :name (nil when the entry has no full name),
  # :uniprot_name, :synonyms, :pfams, :protein_sequence, :uniprot_taxon_id,
  # :uniprot_taxon_db, :organism_scientific_name, :pubmed_ids, :go_classes.
  # Keys set only when the matching node exists: :gene_name,
  # :general_function, :genecard_id, :hgnc_id, :species.
  #
  # @param uniprot_id [String] accession interpolated into the entry URL
  # @return [Hash{Symbol => Object}]
  # @raise [Uniprot::Error] when a GO term has an unknown category prefix
  # @raise [NoMethodError] when a node read unconditionally (entry name,
  #   sequence, organism, Pfam entry name, GO term) is missing from the document
  def self.annotate(uniprot_id)
    doc = fetch_document(uniprot_id)
    results = {}

    # Grab identifiers
    results[:name] = primary_name(doc)&.inner_text
    results[:uniprot_name] = doc.at_xpath("uniprot/entry/name").inner_text
    gene = doc.at_xpath("//gene/name[@type='primary']")
    results[:gene_name] = gene.inner_text if gene

    # Synonyms: every gene synonym, then the first short name and the first
    # EC number of the recommended name when present.
    results[:synonyms] = doc.xpath("//gene/name[@type='synonym']").map(&:inner_text)
    ["//recommendedName/shortName", "//recommendedName/ecNumber"].each do |query|
      node = doc.at_xpath(query)
      results[:synonyms] << node.inner_text if node
    end

    # Function
    function = doc.at_xpath("//comment[@type='function']")
    results[:general_function] = function.inner_text.strip if function

    # PFam
    results[:pfams] = doc.xpath("//dbReference[@type='Pfam']").map do |pfam|
      { :name => pfam.at_xpath('property[@type="entry name"]')['value'], :id => pfam['id'] }
    end

    # Protein Sequence
    results[:protein_sequence] = doc.at_xpath('//sequence').inner_text.strip

    # External identifiers
    genecard = doc.at_xpath("//dbReference[@type='GeneCards']")
    results[:genecard_id] = genecard['id'] if genecard
    hgnc = doc.at_xpath("//dbReference[@type='HGNC']")
    results[:hgnc_id] = hgnc['id'] if hgnc

    # Grab organism info
    organism = doc.at_xpath('//organism')
    taxon = organism.at_xpath("dbReference")
    results[:uniprot_taxon_id] = taxon['id']
    results[:uniprot_taxon_db] = taxon['type']
    results[:organism_scientific_name] = organism.at_xpath('name[@type="scientific"]').inner_text
    common_name = organism.at_xpath('name[@type="common"]')
    results[:species] = common_name.inner_text if common_name

    # Grab references (the same PubMed id can be cited more than once)
    results[:pubmed_ids] = doc.xpath("//dbReference[@type='PubMed']").map { |pubmed| pubmed['id'] }.uniq

    # Grab GO classifications
    results[:go_classes] = doc.xpath("//dbReference[@type='GO']").map do |go|
      go_class(go.at_xpath('property[@type="term"]')['value'])
    end

    results
  end

  # Fetch the XML for uniprot_id and parse it with namespaces stripped, so the
  # XPath queries above can use bare element names.
  def self.fetch_document(uniprot_id)
    # URI.open instead of Kernel#open: open-uri no longer patches Kernel#open
    # on Ruby 3.0+. The block form closes the underlying IO once it is read.
    xml = URI.open("http://www.uniprot.org/uniprot/#{uniprot_id}.xml", &:read)
    Nokogiri::XML(xml).tap(&:remove_namespaces!)
  end

  # Node holding the protein's full name: recommended first, submitted second.
  # Returns nil when the document has neither.
  def self.primary_name(doc)
    doc.at_xpath("//recommendedName/fullName") || doc.at_xpath("//submittedName/fullName")
  end

  # Split a GO term such as "F:kinase activity" into category and description.
  def self.go_class(term)
    # Limit of 2 keeps any further colons inside the description.
    code, description = term.split(':', 2)
    category = GO_CATEGORIES.fetch(code) { raise Error, "GO class not found: #{term.inspect}" }
    { :category => category, :description => description }
  end

  private_class_method :fetch_document, :primary_name, :go_class
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment