Skip to content

Instantly share code, notes, and snippets.

@ktym
Created September 12, 2012 04:30
Show Gist options
  • Save ktym/3704296 to your computer and use it in GitHub Desktop.
Save ktym/3704296 to your computer and use it in GitHub Desktop.
Convert RefSeq genome entry into RDF/Turtle using FALDO (BH12)
#!/usr/bin/env ruby-1.9
require 'rubygems'
require 'bio'
require 'json'
require 'securerandom'
# [TODO] true to combine the result with the EdgeDB
if $DEBUG
$edgedb = true
else
$edgedb = false
end
class FT_SO
# https://gist.github.com/raw/3650401/b97090720965752a97633e22f4a2e1565ba78d4a/ft_so.json
def initialize
@data = JSON.parse(File.read("ft_so.json"))
end
def so_id(feature)
if hash = @data[feature]
return hash["so_id"]
end
end
def so_term(feature)
if hash = @data[feature]
return hash["so_term"]
end
end
def so_desc(feature)
if hash = @data[feature]
return hash["so_desc"]
end
end
def ft_desc(feature)
if hash = @data[feature]
return hash["ft_desc"]
end
end
end
module Bio
class GenBank
def dblink
fetch('DBLINK')
end
def bioproject
dblink[/\d+/]
end
end
end
module RDFSupport
def new_uuid(prefix = "http://genome.db/uuid/")
#return "genome:uuid-#{SecureRandom.uuid}" or
return "<#{prefix}#{SecureRandom.uuid}>"
#return "#{prefix}:uuid-#{SecureRandom.uuid}"
end
def quote(str)
return "\"#{str}\""
end
def triple(s, p, o)
return [s, p, o].join("\t") + " ."
end
def default_prefix
return [
triple("@prefix", "rdf:", "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>"),
triple("@prefix", "rdfs:", "<http://www.w3.org/2000/01/rdf-schema#>"),
triple("@prefix", "dcterms:", "<http://purl.org/dc/terms/>"),
triple("@prefix", "xsd:", "<http://www.w3.org/2001/XMLSchema#>"),
triple("@prefix", "obo:", "<http://purl.obolibrary.org/obo/>"),
]
end
def usdate2date(str)
return Date.parse(str).strftime("%Y-%m-%d")
end
end
class RefSeq2RDF
include RDFSupport
def initialize(io = ARGF, seqtype = nil, seqname = nil)
set_prefixes
set_links
@seqtype = seqtype || "SO:chromosome"
@seqname = seqname || "Chromosome"
@ft_so = FT_SO.new
puts prefix
puts
parse_refseq(io)
end
attr_accessor :prefix
def set_prefixes
@prefix = default_prefix + [
triple("@prefix", "genome:", "<http://genome.db/sw/>"),
triple("@prefix", "genomeid:", "<http://genome.db/id/>"),
triple("@prefix", "genomeseq:", "<http://genome.db/seq/>"),
triple("@prefix", "edgedb:", "<http://purl.jp/11/edgedb/>"),
triple("@prefix", "faldo:", "<http://biohackathon.org/faldo/>"),
]
end
def set_links
@link = {
:ncbi_genome => "http://www.ncbi.nlm.nih.gov/genome/?term=",
:ncbi_gi => "http://www.ncbi.nlm.nih.gov/nuccore/",
:ncbi_refseq => "http://www.ncbi.nlm.nih.gov/nuccore/",
:ncbi_bioproject => "http://www.ncbi.nlm.nih.gov/bioproject/",
:ncbi_pubmed => "http://www.ncbi.nlm.nih.gov/pubmed/",
:ncbi_taxonomy => "http://www.ncbi.nlm.nih.gov/taxonomy/",
:ncbi_geneid => "http://www.ncbi.nlm.nih.gov/gene/",
:ncbi_protein => "http://www.ncbi.nlm.nih.gov/protein/",
}
end
def new_location(pos, elem = false)
loc_id = new_uuid
puts triple(loc_id, "genome:position", quote(pos))
@locations = Bio::Locations.new(pos)
pos_start = new_uuid
pos_end = new_uuid
puts triple(loc_id, "faldo:start", pos_start)
puts triple(loc_id, "faldo:end", pos_end)
new_position(pos_start, @locations.range.min, @locations.first.strand)
new_position(pos_end, @locations.range.max, @locations.last.strand)
list = []
if elem
@locations.each do |loc|
elem_id = new_uuid
elem_start = new_uuid
elem_end = new_uuid
puts triple(elem_id, "rdfs:type", elem)
puts triple(elem_id, "dcterms:isPartOf", loc_id)
puts triple(elem_id, "faldo:start", elem_start)
puts triple(elem_id, "faldo:end", elem_end)
new_position(elem_start, loc.from, loc.strand)
new_position(elem_end, loc.to, loc.strand)
list << elem_id
end
end
return loc_id, list
end
def new_position(pos_id, pos, strand)
puts triple(pos_id, "faldo:position", pos)
puts triple(pos_id, "faldo:reference", @sequence_id)
puts triple(pos_id, "rdf:type", "faldo:ExactlyKnownPosition")
if strand > 0
puts triple(pos_id, "rdf:type", "faldo:ForwardStrandPosition")
else
puts triple(pos_id, "rdf:type", "faldo:ReverseStrandPosition")
end
end
###
### Main
###
def parse_refseq(io)
# Read RefSeq entry
Bio::FlatFile.auto(io).each do |entry|
@entry = entry
@features = entry.features
@source = @features.shift
parse_sequence
parse_source
parse_genes
parse_cds
parse_features
end
end
###
### Sequence
###
def parse_sequence
@sequence_id = new_uuid
# [TODO] How to identify the input is chromosome/plasmid/contig/...?
sequence_type(@seqtype)
# [TODO] Obtain rdfs:label from source /chromosome (eukaryotes) /plasmid (prokaryotes)
sequence_label(@seqname)
sequence_comment(@entry.definition)
sequence_version(@entry.acc_version)
sequence_length(@entry.nalen)
# [TODO] provides REST API to retreive genomic DNA sequence by <@sequence_id.fasta>
sequence_seq
sequence_form(@entry.circular)
# [TODO] sequenced date, modified in the source db or in our RDF data?
sequence_date(@entry.date)
# [TODO] rdfs:seeAlso (like UniProt) or dc:relation, owl:sameAs
sequence_link_gi(@entry.gi.sub('GI:',''))
sequence_link_accver(@entry.acc_version)
sequence_link_bioproject(@entry.bioproject)
# [TODO] how to deal with direct submissions (references without PMID)?
sequence_ref(@entry.references)
end
def sequence_type(so = "SO:chromosome")
case so
when /0000340/, "SO:chromosome"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000340")
# [TODO] Do we need this?
#puts triple(sequence_id, "genome:class", "genome:Chromosome")
when /0000155/, "SO:plasmid"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000155")
when /0000736/, "SO:organelle_sequence"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000736")
when /0000819/, "SO:mitochondrial_chromosome"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000819")
when /0000740/, "SO:plastid_sequence"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000740")
when /0000719/, "SO:ultracontig"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000719")
when /0000148/, "SO:supercontig", "SO:scaffold"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000148")
when /0000149/, "SO:contig"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000149")
else
# [TODO] What to do with this?
puts triple(@sequence_id, "genome:class", "genome:Sequence")
end
end
def sequence_label(str)
# Use "name:" key in the JSON version
puts triple(@sequence_id, "rdfs:label", quote(str))
end
def sequence_comment(str)
# Use "description:" key in the JSON version
puts triple(@sequence_id, "rdfs:comment", quote(str))
end
def sequence_version(str)
# [TODO] What predicate to use?
puts triple(@sequence_id, "genome:version", quote(str))
end
def sequence_length(int)
puts triple(@sequence_id, "genome:length", int)
end
def sequence_seq
#puts triple(@sequence_id, "genome:sequence", "genomeseq:#{@sequence_id}")
puts triple(@sequence_id, "genome:sequence", @sequence_id.sub(/>$/, '.fasta>'))
end
def sequence_form(form)
# [TODO] rdf:type or genome:molecularForm (which rhymes with molecularType)
case form
when "linear"
#puts triple(@sequence_id, "genome:molecularForm", "obo:SO_0000987") # SO:linear
puts triple(@sequence_id, "rdf:type", "obo:SO_0000987") # SO:linear
when "circular"
#puts triple(@sequence_id, "genome:molecularForm", "obo:SO_0000988") # SO:circular
puts triple(@sequence_id, "rdf:type", "obo:SO_0000988") # SO:circular
end
end
def sequence_date(date)
puts triple(@sequence_id, "dcterms:modified", quote(usdate2date(date))+"^^xsd:date")
end
def sequence_link_gi(str)
uri = "<#{@link[:ncbi_gi]}#{str}>"
puts triple(@sequence_id, "rdfs:seeAlso", uri)
puts triple(uri, "edgedb:database", "edgedb:GI") if $edgedb
end
def sequence_link_accver(str)
uri = "<#{@link[:ncbi_refseq]}#{str}>"
puts triple(@sequence_id, "rdfs:seeAlso", uri)
puts triple(uri, "edgedb:database", "edgedb:RefSeq") if $edgedb
end
def sequence_link_bioproject(str)
uri = "<#{@link[:ncbi_bioproject]}#{str}>"
puts triple(@sequence_id, "rdfs:seeAlso", uri)
puts triple(uri, "edgedb:database", "edgedb:BioProject") if $edgedb
end
def sequence_ref(refs)
refs.each do |ref|
pmid = ref.pubmed
if pmid.length > 0
uri = "<#{@link[:ncbi_pubmed]}#{pmid}>"
puts triple(@sequence_id, "rdfs:seeAlso", uri)
puts triple(uri, "edgedb:database", "edgedb:PubMed") if $edgedb
end
end
end
###
### Source
###
def parse_source
# Use @sequence_id for @source_id
@source_id = @sequence_id
hash = @source.to_hash
source_position(@source.position)
source_link(hash["db_xref"])
source_organism(hash["organism"].first)
source_strain(hash["strain"].first) if hash["strain"]
source_pseudo if hash["pseudo"]
source_env(hash["isolation_source"] || hash["environmental_sample"])
source_moltype(hash["mol_type"])
["db_xref", "organism", "strain", "pseudo", "isolation_source", "environmental_sample", "mol_type"].each do |key|
hash.delete(key)
end
source_qualifiers(hash)
end
def source_position(pos)
puts triple(@source_id, "genome:location", quote(pos))
range = Bio::Locations.new(pos).range
min = range.min
max = range.max
puts triple(@source_id, "genome:start", min)
puts triple(@source_id, "genome:stop", max)
end
def source_link(links)
links.each do |link|
case link
when /taxon:/
ncbi_taxid = link[/taxon:(\d+)/, 1]
uri = "<#{@link[:ncbi_taxonomy]}#{ncbi_taxid}>"
puts triple(@source_id, "rdfs:seeAlso", uri)
puts triple(uri, "edgedb:database", "edgedb:Taxonomy") if $edgedb
else
# [TODO] link to other organism specific DBs (/db_xref="ATCC:29413")
db, entry_id = link.split(':', 2)
if pfx = @link[db.downcase.to_sym]
uri = "<#{pfx}#{entry_id}>"
puts triple(@source_id, "rdfs:seeAlso", uri)
puts triple(uri, "edgedb:database", "edgedb:#{db}") if $edgedb
else
$stderr.puts "edgedb:#{db} is missing!!"
puts triple(@source_id, "genome:dbxref", quote(link))
end
end
end
end
def source_organism(str)
puts triple(@source_id, "genome:organism", quote(str))
end
def source_strain(str)
puts triple(@source_id, "genome:strain", quote(str))
end
def source_pseudo
puts triple(@source_id, "genome:pseudo", "true")
end
def source_env(vals)
# Especially for ENV division: isolation_source (environmental_sample)
if vals
if vals.length > 0
env = val.to_s.gsub(/\s+/, ' ').strip
puts triple(@source_id, "genome:environment", quote(env))
else
puts triple(@source_id, "genome:environment", quote("N/A"))
end
end
end
def source_moltype(vals)
# Especially useful for viruses
vals.each do |val|
puts triple(@source_id, "genome:molecularType", quote(val))
end
end
def source_qualifiers(hash)
hash.each do |qual, vals|
vals.each do |val|
if val == true
#puts triple(@source_id, "genome:source_#{qual}", true)
puts triple(@source_id, "genome:feature_#{qual}", true)
else
data = val.to_s.gsub(/\s+/, ' ').strip
if data[/^\d+$/]
#puts triple(@source_id, "genome:source_#{qual}", data)
puts triple(@source_id, "genome:feature_#{qual}", data)
else
#puts triple(@source_id, "genome:source_#{qual}", quote(data))
puts triple(@source_id, "genome:feature_#{qual}", quote(data))
end
end
end
end
end
###
### genes
###
def parse_genes
genes = @features.select {|x| x.feature == "gene"}
@locus = {}
count = 1
genes.each do |gene|
gene_id = new_uuid
hash = gene.to_hash
puts triple(gene_id, "rdf:type", "obo:SO_0000704") # SO:gene
#puts triple(gene_id, "genome:class", "genome:Gene")
puts triple(gene_id, "dcterms:isPartOf", @sequence_id)
loc_id, _ = new_location(gene.position)
puts triple(gene_id, "genome:location", loc_id)
if hash["locus_tag"]
locus_tag = hash["locus_tag"].first
@locus[locus_tag] = gene_id
puts triple(gene_id, "rdfs:label", quote(locus_tag))
elsif hash["gene"]
puts triple(gene_id, "rdfs:label", quote(hash["gene"].first))
else
# [TODO] Where else to find gene name?
puts triple(gene_id, "rdfs:label", quote("gene#{count}"))
end
count += 1
parse_qualifiers(gene_id, hash)
end
end
###
### CDS
###
def parse_cds
cdss = @features.select {|x| x.feature == "CDS"}
count = 1
cdss.each do |cds|
cds_id = new_uuid
hash = cds.to_hash
puts triple(cds_id, "rdf:type", "obo:SO_0000316") # SO:CDS
#puts triple(cds_id, "genome:class", "genome:CDS")
if hash["locus_tag"]
if locus_tag = hash["locus_tag"].first
gene_id = @locus[locus_tag]
end
end
if gene_id
puts triple(cds_id, "dcterms:isPartOf", gene_id)
else
# [TODO] sure to do this?
puts triple(cds_id, "dcterms:isPartOf", @sequence_id)
end
if locus_tag
puts triple(cds_id, "rdfs:label", quote(locus_tag))
elsif hash["gene"]
puts triple(cds_id, "rdfs:label", quote(hash["gene"].first))
else
puts triple(cds_id, "rdfs:label", quote("CDS#{count}"))
end
count += 1
loc_id, exons = new_location(cds.position, "obo:SO_0000147") # SO:exon
puts triple(cds_id, "genome:location", loc_id)
puts triple(cds_id, "genome:exons", "(#{exons.join(' ')})") # rdf:List
parse_qualifiers(cds_id, hash)
end
end
###
### Features
###
def parse_features
features = @features.select {|x| ! x.feature[/^(gene|CDS)$/]}
features.each do |feat|
feature = feat.feature
feature_id = new_uuid
hash = feat.to_hash
puts triple(feature_id, "dcterms:isPartOf", @sequence_id)
puts triple(feature_id, "rdfs:label", quote(feature))
if so_id = @ft_so.so_id(feature)
if so_id != "undefined"
so = so_id.sub(':', '_')
# SO:#{@ft_so.so_term(feature)}
puts triple(feature_id, "rdf:type", "obo:#{so}")
else
# SO:sequence_feature
puts triple(feature_id, "rdf:type", "obo:SO_0000110")
end
end
loc_id, _ = new_location(feat.position)
puts triple(feature_id, "genome:location", loc_id)
parse_qualifiers(feature_id, hash)
end
end
def parse_qualifiers(feature_id, hash)
hash.each do |qual, vals|
vals.each do |val|
if val == true
puts triple(feature_id, "genome:feature_#{qual}", true)
else
data = val.to_s.gsub(/\s+/, ' ').strip
case qual
when "protein_id"
uri = "<#{@link[:ncbi_protein]}#{val}>"
puts triple(feature_id, "rdfs:seeAlso", uri)
puts triple(uri, "edgedb:database", "edgedb:Protein") if $edgedb
when "db_xref"
case val
when /GI:/
ncbi_gi = data[/GI:(\d+)/, 1]
uri = "<#{@link[:ncbi_gi]}#{ncbi_gi}>"
puts triple(feature_id, "genome:dbxref", uri)
puts triple(uri, "edgedb:database", "edgedb:NcbiGI") if $edgedb
when /GeneID:/
ncbi_geneid = data[/GeneID:(\d+)/, 1]
uri = "<#{@link[:ncbi_geneid]}#{ncbi_geneid}>"
puts triple(feature_id, "genome:dbxref", uri)
puts triple(uri, "edgedb:database", "edgedb:NcbiGeneID") if $edgedb
else
$stderr.puts "edgedb:#{val.sub(/:.*/,'')} is missing!!"
puts triple(feature_id, "genome:dbxref", quote(val))
end
else
if data[/^\d+$/]
puts triple(feature_id, "genome:feature_#{qual}", data)
else
puts triple(feature_id, "genome:feature_#{qual}", quote(data))
end
end
end
end
end
end
end
if __FILE__ == $0
require 'getoptlong'
args = GetoptLong.new(
[ '--seqtype', '-t', GetoptLong::REQUIRED_ARGUMENT ],
[ '--seqname', '-n', GetoptLong::REQUIRED_ARGUMENT ],
)
opts = {}
args.each_option do |name, value|
case name
when /--seqtype/
opts[:seqtype] = value
when /--seqname/
opts[:seqname] = value
end
end
RefSeq2RDF.new(ARGF, opts[:seqtype], opts[:seqname])
end
# obj = FT_SO.new
# puts obj.so_id("-10_signal")
# exit
=begin
genome:Region
rdf:type rdfs:Class ;
rdfs:label "Sequence" ;
genome:Contig
rdf:type rdfs:Class ;
rdfs:label "Contig" ;
genome:Gene
rdf:type rdfs:Class ;
rdfs:label "Gene" ;
genome:length
rdf:type rdf:Property ;
rdfs:domain genome:Region ;
rdfs:range xsd:int .
=end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment