-
-
Save tfuji/5537583 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby-1.9 | |
require 'rubygems' | |
require 'bio' | |
require 'json' | |
require 'securerandom' | |
# [TODO] true to combine the result with the EdgeDB | |
if $DEBUG | |
$edgedb = true | |
else | |
$edgedb = false | |
end | |
class FT_SO | |
# https://gist.github.com/raw/3650401/b97090720965752a97633e22f4a2e1565ba78d4a/ft_so.json | |
def initialize | |
@data = JSON.parse(File.read("ft_so.json")) | |
end | |
def so_id(feature) | |
if hash = @data[feature] | |
return hash["so_id"] | |
end | |
end | |
def so_term(feature) | |
if hash = @data[feature] | |
return hash["so_term"] | |
end | |
end | |
def so_desc(feature) | |
if hash = @data[feature] | |
return hash["so_desc"] | |
end | |
end | |
def ft_desc(feature) | |
if hash = @data[feature] | |
return hash["ft_desc"] | |
end | |
end | |
end | |
module Bio | |
class GenBank | |
def dblink | |
fetch('DBLINK') | |
end | |
def bioproject | |
dblink[/\d+/] | |
end | |
end | |
end | |
module RDFSupport | |
def new_uuid(prefix = "http://genome.db/uuid/") | |
#return "genome:uuid-#{SecureRandom.uuid}" or | |
return "<#{prefix}#{SecureRandom.uuid}>" | |
#return "#{prefix}:uuid-#{SecureRandom.uuid}" | |
end | |
def quote(str) | |
return "\"#{str}\"" | |
end | |
def triple(s, p, o) | |
return [s, p, o].join("\t") + " ." | |
end | |
def default_prefix | |
return [ | |
triple("@prefix", "rdf:", "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>"), | |
triple("@prefix", "rdfs:", "<http://www.w3.org/2000/01/rdf-schema#>"), | |
triple("@prefix", "dcterms:", "<http://purl.org/dc/terms/>"), | |
triple("@prefix", "xsd:", "<http://www.w3.org/2001/XMLSchema#>"), | |
triple("@prefix", "obo:", "<http://purl.obolibrary.org/obo/>"), | |
] | |
end | |
def usdate2date(str) | |
return Date.parse(str).strftime("%Y-%m-%d") | |
end | |
end | |
class RefSeq2RDF | |
include RDFSupport | |
def initialize(io = ARGF, seqtype = nil, seqname = nil) | |
set_prefixes | |
set_links | |
@seqtype = seqtype || "SO:chromosome" | |
@seqname = seqname || "Chromosome" | |
@ft_so = FT_SO.new | |
puts prefix | |
puts | |
parse_refseq(io) | |
end | |
attr_accessor :prefix | |
def set_prefixes | |
@prefix = default_prefix + [ | |
triple("@prefix", "genome:", "<http://genome.db/sw/>"), | |
triple("@prefix", "genomeid:", "<http://genome.db/id/>"), | |
triple("@prefix", "genomeseq:", "<http://genome.db/seq/>"), | |
triple("@prefix", "edgedb:", "<http://purl.jp/11/edgedb/>"), | |
triple("@prefix", "faldo:", "<http://biohackathon.org/faldo/>"), | |
] | |
end | |
def set_links | |
@link = { | |
:ncbi_genome => "http://www.ncbi.nlm.nih.gov/genome/?term=", | |
:ncbi_gi => "http://www.ncbi.nlm.nih.gov/nuccore/", | |
:ncbi_refseq => "http://www.ncbi.nlm.nih.gov/nuccore/", | |
:ncbi_bioproject => "http://www.ncbi.nlm.nih.gov/bioproject/", | |
:ncbi_pubmed => "http://www.ncbi.nlm.nih.gov/pubmed/", | |
:ncbi_taxonomy => "http://www.ncbi.nlm.nih.gov/taxonomy/", | |
:ncbi_geneid => "http://www.ncbi.nlm.nih.gov/gene/", | |
:ncbi_protein => "http://www.ncbi.nlm.nih.gov/protein/", | |
} | |
end | |
def new_location(pos, elem = false) | |
loc_id = new_uuid | |
puts triple(loc_id, "genome:position", quote(pos)) | |
@locations = Bio::Locations.new(pos) | |
pos_start = new_uuid | |
pos_end = new_uuid | |
puts triple(loc_id, "faldo:start", pos_start) | |
puts triple(loc_id, "faldo:end", pos_end) | |
new_position(pos_start, @locations.range.min, @locations.first.strand) | |
new_position(pos_end, @locations.range.max, @locations.last.strand) | |
list = [] | |
if elem | |
@locations.each do |loc| | |
elem_id = new_uuid | |
elem_start = new_uuid | |
elem_end = new_uuid | |
puts triple(elem_id, "rdfs:type", elem) | |
puts triple(elem_id, "dcterms:isPartOf", loc_id) | |
puts triple(elem_id, "faldo:start", elem_start) | |
puts triple(elem_id, "faldo:end", elem_end) | |
new_position(elem_start, loc.from, loc.strand) | |
new_position(elem_end, loc.to, loc.strand) | |
list << elem_id | |
end | |
end | |
return loc_id, list | |
end | |
def new_position(pos_id, pos, strand) | |
puts triple(pos_id, "faldo:position", pos) | |
puts triple(pos_id, "faldo:reference", @sequence_id) | |
puts triple(pos_id, "rdf:type", "faldo:ExactlyKnownPosition") | |
if strand > 0 | |
puts triple(pos_id, "rdf:type", "faldo:ForwardStrandPosition") | |
else | |
puts triple(pos_id, "rdf:type", "faldo:ReverseStrandPosition") | |
end | |
end | |
### | |
### Main | |
### | |
def parse_refseq(io) | |
# Read RefSeq entry | |
Bio::FlatFile.auto(io).each do |entry| | |
@entry = entry | |
@features = entry.features | |
@source = @features.shift | |
parse_sequence | |
parse_source | |
parse_genes | |
parse_cds | |
parse_features | |
end | |
end | |
### | |
### Sequence | |
### | |
def parse_sequence | |
@sequence_id = new_uuid | |
# [TODO] How to identify the input is chromosome/plasmid/contig/...? | |
sequence_type(@seqtype) | |
# [TODO] Obtain rdfs:label from source /chromosome (eukaryotes) /plasmid (prokaryotes) | |
sequence_label(@seqname) | |
sequence_comment(@entry.definition) | |
sequence_version(@entry.acc_version) | |
sequence_length(@entry.nalen) | |
# [TODO] provides REST API to retreive genomic DNA sequence by <@sequence_id.fasta> | |
sequence_seq | |
sequence_form(@entry.circular) | |
# [TODO] sequenced date, modified in the source db or in our RDF data? | |
sequence_date(@entry.date) | |
# [TODO] rdfs:seeAlso (like UniProt) or dc:relation, owl:sameAs | |
sequence_link_gi(@entry.gi.sub('GI:','')) | |
sequence_link_accver(@entry.acc_version) | |
sequence_link_bioproject(@entry.bioproject) | |
# [TODO] how to deal with direct submissions (references without PMID)? | |
sequence_ref(@entry.references) | |
end | |
def sequence_type(so = "SO:chromosome") | |
case so | |
when /0000340/, "SO:chromosome" | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000340") | |
# [TODO] Do we need this? | |
#puts triple(sequence_id, "genome:class", "genome:Chromosome") | |
when /0000155/, "SO:plasmid" | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000155") | |
when /0000736/, "SO:organelle_sequence" | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000736") | |
when /0000819/, "SO:mitochondrial_chromosome" | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000819") | |
when /0000740/, "SO:plastid_sequence" | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000740") | |
when /0000719/, "SO:ultracontig" | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000719") | |
when /0000148/, "SO:supercontig", "SO:scaffold" | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000148") | |
when /0000149/, "SO:contig" | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000149") | |
else | |
# [TODO] What to do with this? | |
puts triple(@sequence_id, "genome:class", "genome:Sequence") | |
end | |
end | |
def sequence_label(str) | |
# Use "name:" key in the JSON version | |
puts triple(@sequence_id, "rdfs:label", quote(str)) | |
end | |
def sequence_comment(str) | |
# Use "description:" key in the JSON version | |
puts triple(@sequence_id, "rdfs:comment", quote(str)) | |
end | |
def sequence_version(str) | |
# [TODO] What predicate to use? | |
puts triple(@sequence_id, "genome:version", quote(str)) | |
end | |
def sequence_length(int) | |
puts triple(@sequence_id, "genome:length", int) | |
end | |
def sequence_seq | |
#puts triple(@sequence_id, "genome:sequence", "genomeseq:#{@sequence_id}") | |
puts triple(@sequence_id, "genome:sequence", @sequence_id.sub(/>$/, '.fasta>')) | |
end | |
def sequence_form(form) | |
# [TODO] rdf:type or genome:molecularForm (which rhymes with molecularType) | |
case form | |
when "linear" | |
#puts triple(@sequence_id, "genome:molecularForm", "obo:SO_0000987") # SO:linear | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000987") # SO:linear | |
when "circular" | |
#puts triple(@sequence_id, "genome:molecularForm", "obo:SO_0000988") # SO:circular | |
puts triple(@sequence_id, "rdf:type", "obo:SO_0000988") # SO:circular | |
end | |
end | |
def sequence_date(date) | |
puts triple(@sequence_id, "dcterms:modified", quote(usdate2date(date))+"^^xsd:date") | |
end | |
def sequence_link_gi(str) | |
uri = "<#{@link[:ncbi_gi]}#{str}>" | |
puts triple(@sequence_id, "rdfs:seeAlso", uri) | |
puts triple(uri, "edgedb:database", "edgedb:GI") if $edgedb | |
end | |
def sequence_link_accver(str) | |
uri = "<#{@link[:ncbi_refseq]}#{str}>" | |
puts triple(@sequence_id, "rdfs:seeAlso", uri) | |
puts triple(uri, "edgedb:database", "edgedb:RefSeq") if $edgedb | |
end | |
def sequence_link_bioproject(str) | |
uri = "<#{@link[:ncbi_bioproject]}#{str}>" | |
puts triple(@sequence_id, "rdfs:seeAlso", uri) | |
puts triple(uri, "edgedb:database", "edgedb:BioProject") if $edgedb | |
end | |
def sequence_ref(refs) | |
refs.each do |ref| | |
pmid = ref.pubmed | |
if pmid.length > 0 | |
uri = "<#{@link[:ncbi_pubmed]}#{pmid}>" | |
puts triple(@sequence_id, "rdfs:seeAlso", uri) | |
puts triple(uri, "edgedb:database", "edgedb:PubMed") if $edgedb | |
end | |
end | |
end | |
### | |
### Source | |
### | |
def parse_source | |
# Use @sequence_id for @source_id | |
@source_id = @sequence_id | |
hash = @source.to_hash | |
source_position(@source.position) | |
source_link(hash["db_xref"]) | |
source_organism(hash["organism"].first) | |
source_strain(hash["strain"].first) if hash["strain"] | |
source_pseudo if hash["pseudo"] | |
source_env(hash["isolation_source"] || hash["environmental_sample"]) | |
source_moltype(hash["mol_type"]) | |
["db_xref", "organism", "strain", "pseudo", "isolation_source", "environmental_sample", "mol_type"].each do |key| | |
hash.delete(key) | |
end | |
source_qualifiers(hash) | |
end | |
def source_position(pos) | |
puts triple(@source_id, "genome:location", quote(pos)) | |
range = Bio::Locations.new(pos).range | |
min = range.min | |
max = range.max | |
puts triple(@source_id, "genome:start", min) | |
puts triple(@source_id, "genome:stop", max) | |
end | |
def source_link(links) | |
links.each do |link| | |
case link | |
when /taxon:/ | |
ncbi_taxid = link[/taxon:(\d+)/, 1] | |
uri = "<#{@link[:ncbi_taxonomy]}#{ncbi_taxid}>" | |
puts triple(@source_id, "rdfs:seeAlso", uri) | |
puts triple(uri, "edgedb:database", "edgedb:Taxonomy") if $edgedb | |
else | |
# [TODO] link to other organism specific DBs (/db_xref="ATCC:29413") | |
db, entry_id = link.split(':', 2) | |
if pfx = @link[db.downcase.to_sym] | |
uri = "<#{pfx}#{entry_id}>" | |
puts triple(@source_id, "rdfs:seeAlso", uri) | |
puts triple(uri, "edgedb:database", "edgedb:#{db}") if $edgedb | |
else | |
$stderr.puts "edgedb:#{db} is missing!!" | |
puts triple(@source_id, "genome:dbxref", quote(link)) | |
end | |
end | |
end | |
end | |
def source_organism(str) | |
puts triple(@source_id, "genome:organism", quote(str)) | |
end | |
def source_strain(str) | |
puts triple(@source_id, "genome:strain", quote(str)) | |
end | |
def source_pseudo | |
puts triple(@source_id, "genome:pseudo", "true") | |
end | |
def source_env(vals) | |
# Especially for ENV division: isolation_source (environmental_sample) | |
if vals | |
if vals.length > 0 | |
env = val.to_s.gsub(/\s+/, ' ').strip | |
puts triple(@source_id, "genome:environment", quote(env)) | |
else | |
puts triple(@source_id, "genome:environment", quote("N/A")) | |
end | |
end | |
end | |
def source_moltype(vals) | |
# Especially useful for viruses | |
vals.each do |val| | |
puts triple(@source_id, "genome:molecularType", quote(val)) | |
end | |
end | |
def source_qualifiers(hash) | |
hash.each do |qual, vals| | |
vals.each do |val| | |
if val == true | |
#puts triple(@source_id, "genome:source_#{qual}", true) | |
puts triple(@source_id, "genome:feature_#{qual}", true) | |
else | |
data = val.to_s.gsub(/\s+/, ' ').strip | |
if data[/^\d+$/] | |
#puts triple(@source_id, "genome:source_#{qual}", data) | |
puts triple(@source_id, "genome:feature_#{qual}", data) | |
else | |
#puts triple(@source_id, "genome:source_#{qual}", quote(data)) | |
puts triple(@source_id, "genome:feature_#{qual}", quote(data)) | |
end | |
end | |
end | |
end | |
end | |
### | |
### genes | |
### | |
def parse_genes | |
genes = @features.select {|x| x.feature == "gene"} | |
@locus = {} | |
count = 1 | |
genes.each do |gene| | |
gene_id = new_uuid | |
hash = gene.to_hash | |
puts triple(gene_id, "rdf:type", "obo:SO_0000704") # SO:gene | |
#puts triple(gene_id, "genome:class", "genome:Gene") | |
puts triple(gene_id, "dcterms:isPartOf", @sequence_id) | |
loc_id, _ = new_location(gene.position) | |
puts triple(gene_id, "genome:location", loc_id) | |
if hash["locus_tag"] | |
locus_tag = hash["locus_tag"].first | |
@locus[locus_tag] = gene_id | |
puts triple(gene_id, "rdfs:label", quote(locus_tag)) | |
elsif hash["gene"] | |
puts triple(gene_id, "rdfs:label", quote(hash["gene"].first)) | |
else | |
# [TODO] Where else to find gene name? | |
puts triple(gene_id, "rdfs:label", quote("gene#{count}")) | |
end | |
count += 1 | |
parse_qualifiers(gene_id, hash) | |
end | |
end | |
### | |
### CDS | |
### | |
def parse_cds | |
cdss = @features.select {|x| x.feature == "CDS"} | |
count = 1 | |
cdss.each do |cds| | |
cds_id = new_uuid | |
hash = cds.to_hash | |
puts triple(cds_id, "rdf:type", "obo:SO_0000316") # SO:CDS | |
#puts triple(cds_id, "genome:class", "genome:CDS") | |
if hash["locus_tag"] | |
if locus_tag = hash["locus_tag"].first | |
gene_id = @locus[locus_tag] | |
end | |
end | |
if gene_id | |
puts triple(cds_id, "dcterms:isPartOf", gene_id) | |
else | |
# [TODO] sure to do this? | |
puts triple(cds_id, "dcterms:isPartOf", @sequence_id) | |
end | |
if locus_tag | |
puts triple(cds_id, "rdfs:label", quote(locus_tag)) | |
elsif hash["gene"] | |
puts triple(cds_id, "rdfs:label", quote(hash["gene"].first)) | |
else | |
puts triple(cds_id, "rdfs:label", quote("CDS#{count}")) | |
end | |
count += 1 | |
loc_id, exons = new_location(cds.position, "obo:SO_0000147") # SO:exon | |
puts triple(cds_id, "genome:location", loc_id) | |
puts triple(cds_id, "genome:exons", "(#{exons.join(' ')})") # rdf:List | |
parse_qualifiers(cds_id, hash) | |
end | |
end | |
### | |
### Features | |
### | |
def parse_features | |
features = @features.select {|x| ! x.feature[/^(gene|CDS)$/]} | |
features.each do |feat| | |
feature = feat.feature | |
feature_id = new_uuid | |
hash = feat.to_hash | |
puts triple(feature_id, "dcterms:isPartOf", @sequence_id) | |
puts triple(feature_id, "rdfs:label", quote(feature)) | |
if so_id = @ft_so.so_id(feature) | |
if so_id != "undefined" | |
so = so_id.sub(':', '_') | |
# SO:#{@ft_so.so_term(feature)} | |
puts triple(feature_id, "rdf:type", "obo:#{so}") | |
else | |
# SO:sequence_feature | |
puts triple(feature_id, "rdf:type", "obo:SO_0000110") | |
end | |
end | |
loc_id, _ = new_location(feat.position) | |
puts triple(feature_id, "genome:location", loc_id) | |
parse_qualifiers(feature_id, hash) | |
end | |
end | |
def parse_qualifiers(feature_id, hash) | |
hash.each do |qual, vals| | |
vals.each do |val| | |
if val == true | |
puts triple(feature_id, "genome:feature_#{qual}", true) | |
else | |
data = val.to_s.gsub(/\s+/, ' ').strip | |
case qual | |
when "protein_id" | |
uri = "<#{@link[:ncbi_protein]}#{val}>" | |
puts triple(feature_id, "rdfs:seeAlso", uri) | |
puts triple(uri, "edgedb:database", "edgedb:Protein") if $edgedb | |
when "db_xref" | |
case val | |
when /GI:/ | |
ncbi_gi = data[/GI:(\d+)/, 1] | |
uri = "<#{@link[:ncbi_gi]}#{ncbi_gi}>" | |
puts triple(feature_id, "genome:dbxref", uri) | |
puts triple(uri, "edgedb:database", "edgedb:NcbiGI") if $edgedb | |
when /GeneID:/ | |
ncbi_geneid = data[/GeneID:(\d+)/, 1] | |
uri = "<#{@link[:ncbi_geneid]}#{ncbi_geneid}>" | |
puts triple(feature_id, "genome:dbxref", uri) | |
puts triple(uri, "edgedb:database", "edgedb:NcbiGeneID") if $edgedb | |
else | |
$stderr.puts "edgedb:#{val.sub(/:.*/,'')} is missing!!" | |
puts triple(feature_id, "genome:dbxref", quote(val)) | |
end | |
else | |
if data[/^\d+$/] | |
puts triple(feature_id, "genome:feature_#{qual}", data) | |
else | |
puts triple(feature_id, "genome:feature_#{qual}", quote(data)) | |
end | |
end | |
end | |
end | |
end | |
end | |
end | |
if __FILE__ == $0 | |
require 'getoptlong' | |
args = GetoptLong.new( | |
[ '--seqtype', '-t', GetoptLong::REQUIRED_ARGUMENT ], | |
[ '--seqname', '-n', GetoptLong::REQUIRED_ARGUMENT ], | |
) | |
opts = {} | |
args.each_option do |name, value| | |
case name | |
when /--seqtype/ | |
opts[:seqtype] = value | |
when /--seqname/ | |
opts[:seqname] = value | |
end | |
end | |
RefSeq2RDF.new(ARGF, opts[:seqtype], opts[:seqname]) | |
end | |
# obj = FT_SO.new | |
# puts obj.so_id("-10_signal") | |
# exit | |
=begin | |
genome:Region | |
rdf:type rdfs:Class ; | |
rdfs:label "Sequence" ; | |
genome:Contig | |
rdf:type rdfs:Class ; | |
rdfs:label "Contig" ; | |
genome:Gene | |
rdf:type rdfs:Class ; | |
rdfs:label "Gene" ; | |
genome:length | |
rdf:type rdf:Property ; | |
rdfs:domain genome:Region ; | |
rdfs:range xsd:int . | |
=end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment