ktym · September 12, 2012 04:30
diff --git a/refseq2rdf.rb b/refseq2rdf.rb
 #!/usr/bin/env ruby-1.9

 require 'rubygems'
 require 'bio'
 require 'json'
 require 'securerandom'

 # [TODO] true to combine the result with the EdgeDB
 if $DEBUG
  $edgedb = true
 else
  $edgedb = false
 end

 class FT_SO
  # https://gist.github.com/raw/3650401/b97090720965752a97633e22f4a2e1565ba78d4a/ft_so.json
  def initialize
    @data = JSON.parse(File.read("ft_so.json"))
  end

  def so_id(feature)
    if hash = @data[feature]
      return hash["so_id"]
    end
  end

  def so_term(feature)
    if hash = @data[feature]
      return hash["so_term"]
    end
  end

  def so_desc(feature)
    if hash = @data[feature]
      return hash["so_desc"]
    end
  end

  def ft_desc(feature)
    if hash = @data[feature]
      return hash["ft_desc"]
    end
  end
 end

 module Bio
  class GenBank
    def dblink
      fetch('DBLINK')
    end

    def bioproject
      dblink[/\d+/]
    end
  end
 end

 module RDFSupport
  def new_uuid(prefix = "http://genome.db/uuid/")
    #return "genome:uuid-#{SecureRandom.uuid}" or
    return "<#{prefix}#{SecureRandom.uuid}>"
    #return "#{prefix}:uuid-#{SecureRandom.uuid}"
  end

  def quote(str)
    return "\"#{str}\""
  end

  def triple(s, p, o)
    return [s, p, o].join("\t") + " ."
  end

  def default_prefix
    return [
      triple("@prefix", "rdf:", "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>"),
      triple("@prefix", "rdfs:", "<http://www.w3.org/2000/01/rdf-schema#>"),
      triple("@prefix", "dcterms:", "<http://purl.org/dc/terms/>"),
      triple("@prefix", "xsd:", "<http://www.w3.org/2001/XMLSchema#>"),
      triple("@prefix", "obo:", "<http://purl.obolibrary.org/obo/>"),
    ]
  end

  def usdate2date(str)
    return Date.parse(str).strftime("%Y-%m-%d")  
  end
 end


 class RefSeq2RDF

  include RDFSupport

  def initialize(io = ARGF, seqtype = nil, seqname = nil)
    set_prefixes
    set_links

    @seqtype = seqtype || "SO:chromosome"
    @seqname = seqname || "Chromosome"
    @ft_so = FT_SO.new

    puts prefix
    puts

    parse_refseq(io)
  end

  attr_accessor :prefix

  def set_prefixes
    @prefix = default_prefix + [
      triple("@prefix", "genome:", "<http://genome.db/sw/>"),
      triple("@prefix", "genomeid:", "<http://genome.db/id/>"),
      triple("@prefix", "genomeseq:", "<http://genome.db/seq/>"),
      triple("@prefix", "edgedb:", "<http://purl.jp/11/edgedb/>"),
      triple("@prefix", "faldo:", "<http://biohackathon.org/faldo/>"),
    ]
  end

  def set_links
    @link = {
      :ncbi_genome => "http://www.ncbi.nlm.nih.gov/genome/?term=",
      :ncbi_gi => "http://www.ncbi.nlm.nih.gov/nuccore/",
      :ncbi_refseq => "http://www.ncbi.nlm.nih.gov/nuccore/",
      :ncbi_bioproject => "http://www.ncbi.nlm.nih.gov/bioproject/",
      :ncbi_pubmed => "http://www.ncbi.nlm.nih.gov/pubmed/",
      :ncbi_taxonomy => "http://www.ncbi.nlm.nih.gov/taxonomy/",
      :ncbi_geneid => "http://www.ncbi.nlm.nih.gov/gene/",
      :ncbi_protein => "http://www.ncbi.nlm.nih.gov/protein/",
    }
  end

  def new_location(pos, elem = false)
    loc_id = new_uuid

    puts triple(loc_id, "genome:position", quote(pos))

    @locations = Bio::Locations.new(pos)
    pos_start = new_uuid
    pos_end = new_uuid
    puts triple(loc_id, "faldo:start", pos_start)
    puts triple(loc_id, "faldo:end", pos_end)
    new_position(pos_start, @locations.range.min, @locations.first.strand)
    new_position(pos_end, @locations.range.max, @locations.last.strand)

    list = []
    if elem
      @locations.each do |loc|
        elem_id = new_uuid
        elem_start = new_uuid
        elem_end = new_uuid
        puts triple(elem_id, "rdfs:type", elem)
        puts triple(elem_id, "dcterms:isPartOf", loc_id)
        puts triple(elem_id, "faldo:start", elem_start)
        puts triple(elem_id, "faldo:end", elem_end)
        new_position(elem_start, loc.from, loc.strand)
        new_position(elem_end, loc.to, loc.strand)
        list << elem_id
      end
    end

    return loc_id, list
  end

  def new_position(pos_id, pos, strand)
    puts triple(pos_id, "faldo:position", pos)
    puts triple(pos_id, "faldo:reference", @sequence_id)
    puts triple(pos_id, "rdf:type", "faldo:ExactlyKnownPosition")
    if strand > 0
      puts triple(pos_id, "rdf:type", "faldo:ForwardStrandPosition")
    else
      puts triple(pos_id, "rdf:type", "faldo:ReverseStrandPosition")
    end
  end

  ###
  ### Main
  ###

  def parse_refseq(io)
    # Read RefSeq entry
    Bio::FlatFile.auto(io).each do |entry|
      @entry = entry
      @features = entry.features
      @source = @features.shift
      parse_sequence
      parse_source
      parse_genes
      parse_cds
      parse_features
    end
  end

  ###
  ### Sequence
  ###

  def parse_sequence
    @sequence_id = new_uuid

    # [TODO] How to identify the input is chromosome/plasmid/contig/...?
    sequence_type(@seqtype)
    # [TODO] Obtain rdfs:label from source /chromosome (eukaryotes) /plasmid (prokaryotes)
    sequence_label(@seqname)
    sequence_comment(@entry.definition)
    sequence_version(@entry.acc_version)
    sequence_length(@entry.nalen)
    # [TODO] provides REST API to retreive genomic DNA sequence by <@sequence_id.fasta>
    sequence_seq
    sequence_form(@entry.circular)
    # [TODO] sequenced date, modified in the source db or in our RDF data?
    sequence_date(@entry.date)
    # [TODO] rdfs:seeAlso (like UniProt) or dc:relation, owl:sameAs
    sequence_link_gi(@entry.gi.sub('GI:',''))
    sequence_link_accver(@entry.acc_version)
    sequence_link_bioproject(@entry.bioproject)
    # [TODO] how to deal with direct submissions (references without PMID)?
    sequence_ref(@entry.references)
  end

  def sequence_type(so = "SO:chromosome")
    case so
    when /0000340/, "SO:chromosome"
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000340")
      # [TODO] Do we need this?
      #puts triple(sequence_id, "genome:class", "genome:Chromosome")
    when /0000155/, "SO:plasmid"
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000155")
    when /0000736/, "SO:organelle_sequence"
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000736")
    when /0000819/, "SO:mitochondrial_chromosome"
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000819")
    when /0000740/, "SO:plastid_sequence"
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000740")
    when /0000719/, "SO:ultracontig"
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000719")
    when /0000148/, "SO:supercontig", "SO:scaffold"
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000148")
    when /0000149/, "SO:contig"
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000149")
    else
      # [TODO] What to do with this?
      puts triple(@sequence_id, "genome:class", "genome:Sequence")
    end
  end

  def sequence_label(str)
    # Use "name:" key in the JSON version
    puts triple(@sequence_id, "rdfs:label", quote(str))
  end

  def sequence_comment(str)
    # Use "description:" key in the JSON version
    puts triple(@sequence_id, "rdfs:comment", quote(str))
  end

  def sequence_version(str)
    # [TODO] What predicate to use?
    puts triple(@sequence_id, "genome:version", quote(str))
  end

  def sequence_length(int)
    puts triple(@sequence_id, "genome:length", int)
  end

  def sequence_seq
    #puts triple(@sequence_id, "genome:sequence", "genomeseq:#{@sequence_id}")
    puts triple(@sequence_id, "genome:sequence", @sequence_id.sub(/>$/, '.fasta>'))
  end

  def sequence_form(form)
    # [TODO] rdf:type or genome:molecularForm (which rhymes with molecularType)
    case form
    when "linear"
      #puts triple(@sequence_id, "genome:molecularForm", "obo:SO_0000987") # SO:linear
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000987") # SO:linear
    when "circular"
      #puts triple(@sequence_id, "genome:molecularForm", "obo:SO_0000988") # SO:circular
      puts triple(@sequence_id, "rdf:type", "obo:SO_0000988") # SO:circular
    end
  end

  def sequence_date(date)
    puts triple(@sequence_id, "dcterms:modified", quote(usdate2date(date))+"^^xsd:date")
  end

  def sequence_link_gi(str)
    uri = "<#{@link[:ncbi_gi]}#{str}>"
    puts triple(@sequence_id, "rdfs:seeAlso", uri)
    puts triple(uri, "edgedb:database", "edgedb:GI") if $edgedb
  end

  def sequence_link_accver(str)
    uri = "<#{@link[:ncbi_refseq]}#{str}>"
    puts triple(@sequence_id, "rdfs:seeAlso", uri)
    puts triple(uri, "edgedb:database", "edgedb:RefSeq") if $edgedb
  end

  def sequence_link_bioproject(str)
    uri = "<#{@link[:ncbi_bioproject]}#{str}>"
    puts triple(@sequence_id, "rdfs:seeAlso", uri)
    puts triple(uri, "edgedb:database", "edgedb:BioProject") if $edgedb
  end

  def sequence_ref(refs)
    refs.each do |ref|
      pmid = ref.pubmed
      if pmid.length > 0
        uri = "<#{@link[:ncbi_pubmed]}#{pmid}>"
        puts triple(@sequence_id, "rdfs:seeAlso", uri)
        puts triple(uri, "edgedb:database", "edgedb:PubMed") if $edgedb
      end
    end
  end

  ###
  ### Source
  ###

  def parse_source
    # Use @sequence_id for @source_id
    @source_id = @sequence_id

    hash = @source.to_hash
    source_position(@source.position)
    source_link(hash["db_xref"])
    source_organism(hash["organism"].first)
    source_strain(hash["strain"].first) if hash["strain"]
    source_pseudo if hash["pseudo"]
    source_env(hash["isolation_source"] || hash["environmental_sample"])
    source_moltype(hash["mol_type"])
    ["db_xref", "organism", "strain", "pseudo", "isolation_source", "environmental_sample", "mol_type"].each do |key|
      hash.delete(key)
    end
    source_qualifiers(hash)
  end

  def source_position(pos)
    puts triple(@source_id, "genome:location", quote(pos))
    range = Bio::Locations.new(pos).range
    min = range.min
    max = range.max
    puts triple(@source_id, "genome:start", min)
    puts triple(@source_id, "genome:stop", max)
  end

  def source_link(links)
    links.each do |link|
      case link
      when /taxon:/
        ncbi_taxid = link[/taxon:(\d+)/, 1]
        uri = "<#{@link[:ncbi_taxonomy]}#{ncbi_taxid}>"
        puts triple(@source_id, "rdfs:seeAlso", uri)
        puts triple(uri, "edgedb:database", "edgedb:Taxonomy") if $edgedb
      else
        # [TODO] link to other organism specific DBs (/db_xref="ATCC:29413")
        db, entry_id = link.split(':', 2)
        if pfx = @link[db.downcase.to_sym]
          uri = "<#{pfx}#{entry_id}>"
          puts triple(@source_id, "rdfs:seeAlso", uri)
          puts triple(uri, "edgedb:database", "edgedb:#{db}") if $edgedb
        else
          $stderr.puts "edgedb:#{db} is missing!!"
          puts triple(@source_id, "genome:dbxref", quote(link))
        end
      end
    end
  end

  def source_organism(str)
    puts triple(@source_id, "genome:organism", quote(str))
  end

  def source_strain(str)
    puts triple(@source_id, "genome:strain", quote(str))
  end

  def source_pseudo
    puts triple(@source_id, "genome:pseudo", "true")
  end

  def source_env(vals)
    # Especially for ENV division: isolation_source (environmental_sample)
    if vals
      if vals.length > 0
        env = val.to_s.gsub(/\s+/, ' ').strip
        puts triple(@source_id, "genome:environment", quote(env))
      else
        puts triple(@source_id, "genome:environment", quote("N/A"))
      end
    end
  end

  def source_moltype(vals)
    # Especially useful for viruses
    vals.each do |val|
      puts triple(@source_id, "genome:molecularType", quote(val))
    end
  end

  def source_qualifiers(hash)
    hash.each do |qual, vals|
      vals.each do |val|
        if val == true
          #puts triple(@source_id, "genome:source_#{qual}", true)
          puts triple(@source_id, "genome:feature_#{qual}", true)
        else        
          data = val.to_s.gsub(/\s+/, ' ').strip
          if data[/^\d+$/]
            #puts triple(@source_id, "genome:source_#{qual}", data)
            puts triple(@source_id, "genome:feature_#{qual}", data)
          else
            #puts triple(@source_id, "genome:source_#{qual}", quote(data))
            puts triple(@source_id, "genome:feature_#{qual}", quote(data))
          end
        end
      end
    end
  end

  ###
  ### genes
  ###

  def parse_genes
    genes = @features.select {|x| x.feature == "gene"}
  
    @locus = {}
    count = 1
    genes.each do |gene|
      gene_id = new_uuid
      hash = gene.to_hash

      puts triple(gene_id, "rdf:type", "obo:SO_0000704")  # SO:gene
      #puts triple(gene_id, "genome:class", "genome:Gene")

      puts triple(gene_id, "dcterms:isPartOf", @sequence_id)

      loc_id, _ = new_location(gene.position)
      puts triple(gene_id, "genome:location", loc_id)

      if hash["locus_tag"]
        locus_tag = hash["locus_tag"].first
        @locus[locus_tag] = gene_id
        puts triple(gene_id, "rdfs:label", quote(locus_tag))
      elsif hash["gene"]
        puts triple(gene_id, "rdfs:label", quote(hash["gene"].first))
      else
        # [TODO] Where else to find gene name?
        puts triple(gene_id, "rdfs:label", quote("gene#{count}"))
      end
      count += 1

      parse_qualifiers(gene_id, hash)
    end
  end

  ###
  ### CDS
  ###

  def parse_cds
    cdss = @features.select {|x| x.feature == "CDS"}

    count = 1
    cdss.each do |cds|
      cds_id = new_uuid
      hash = cds.to_hash

      puts triple(cds_id, "rdf:type", "obo:SO_0000316")  # SO:CDS
      #puts triple(cds_id, "genome:class", "genome:CDS")

      if hash["locus_tag"]
        if locus_tag = hash["locus_tag"].first
          gene_id = @locus[locus_tag]
        end
      end

      if gene_id
        puts triple(cds_id, "dcterms:isPartOf", gene_id)
      else
        # [TODO] sure to do this?
        puts triple(cds_id, "dcterms:isPartOf", @sequence_id)
      end

      if locus_tag
        puts triple(cds_id, "rdfs:label", quote(locus_tag))
      elsif hash["gene"]
        puts triple(cds_id, "rdfs:label", quote(hash["gene"].first))
      else
        puts triple(cds_id, "rdfs:label", quote("CDS#{count}"))
      end
      count += 1

      loc_id, exons = new_location(cds.position, "obo:SO_0000147") # SO:exon
      puts triple(cds_id, "genome:location", loc_id)
      puts triple(cds_id, "genome:exons", "(#{exons.join(' ')})") # rdf:List

      parse_qualifiers(cds_id, hash)
    end
  end

  ###
  ### Features
  ###

  def parse_features
    features = @features.select {|x| ! x.feature[/^(gene|CDS)$/]}

    features.each do |feat|
      feature = feat.feature
      feature_id = new_uuid
      hash = feat.to_hash

      puts triple(feature_id, "dcterms:isPartOf", @sequence_id)
      puts triple(feature_id, "rdfs:label", quote(feature))

      if so_id = @ft_so.so_id(feature)
        if so_id != "undefined"
          so = so_id.sub(':', '_')
          # SO:#{@ft_so.so_term(feature)}
          puts triple(feature_id, "rdf:type", "obo:#{so}")
        else
          # SO:sequence_feature
          puts triple(feature_id, "rdf:type", "obo:SO_0000110")
        end
      end

      loc_id, _ = new_location(feat.position)
      puts triple(feature_id, "genome:location", loc_id)

      parse_qualifiers(feature_id, hash)
    end
  end

  def parse_qualifiers(feature_id, hash)
    hash.each do |qual, vals|
      vals.each do |val|
        if val == true
          puts triple(feature_id, "genome:feature_#{qual}", true)
        else
          data = val.to_s.gsub(/\s+/, ' ').strip
          case qual
          when "protein_id"
            uri = "<#{@link[:ncbi_protein]}#{val}>"
            puts triple(feature_id, "rdfs:seeAlso", uri)
            puts triple(uri, "edgedb:database", "edgedb:Protein") if $edgedb
          when "db_xref"
            case val
            when /GI:/
              ncbi_gi = data[/GI:(\d+)/, 1]
              uri = "<#{@link[:ncbi_gi]}#{ncbi_gi}>"
              puts triple(feature_id, "genome:dbxref", uri)
              puts triple(uri, "edgedb:database", "edgedb:NcbiGI") if $edgedb
            when /GeneID:/
              ncbi_geneid = data[/GeneID:(\d+)/, 1]
              uri = "<#{@link[:ncbi_geneid]}#{ncbi_geneid}>"
              puts triple(feature_id, "genome:dbxref", uri)
              puts triple(uri, "edgedb:database", "edgedb:NcbiGeneID") if $edgedb
            else
              $stderr.puts "edgedb:#{val.sub(/:.*/,'')} is missing!!"
              puts triple(feature_id, "genome:dbxref", quote(val))
            end
          else
            if data[/^\d+$/]
              puts triple(feature_id, "genome:feature_#{qual}", data)
            else
              puts triple(feature_id, "genome:feature_#{qual}", quote(data))
            end
          end
        end
      end
    end
  end

 end


 if __FILE__ == $0
  require 'getoptlong'

  args = GetoptLong.new(
    [ '--seqtype', '-t', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--seqname', '-n', GetoptLong::REQUIRED_ARGUMENT ],
  )

  opts = {}

  args.each_option do |name, value|
    case name
    when /--seqtype/
      opts[:seqtype] = value
    when /--seqname/
      opts[:seqname] = value
    end
  end

  RefSeq2RDF.new(ARGF, opts[:seqtype], opts[:seqname])
 end

 # obj = FT_SO.new
 # puts obj.so_id("-10_signal")
 # exit

 =begin
 genome:Region
  rdf:type rdfs:Class ;
  rdfs:label "Sequence" ;

 genome:Contig
  rdf:type rdfs:Class ;
  rdfs:label "Contig" ;

 genome:Gene
  rdf:type rdfs:Class ;
  rdfs:label "Gene" ;

 genome:length
  rdf:type rdf:Property ;
  rdfs:domain genome:Region ;
  rdfs:range xsd:int .
 =end
	#!/usr/bin/env ruby-1.9

	require 'rubygems'
	require 'bio'
	require 'json'
	require 'securerandom'

	# [TODO] true to combine the result with the EdgeDB
	if $DEBUG
	$edgedb = true
	else
	$edgedb = false
	end

	class FT_SO
	# https://gist.github.com/raw/3650401/b97090720965752a97633e22f4a2e1565ba78d4a/ft_so.json
	def initialize
	@data = JSON.parse(File.read("ft_so.json"))
	end

	def so_id(feature)
	if hash = @data[feature]
	return hash["so_id"]
	end
	end

	def so_term(feature)
	if hash = @data[feature]
	return hash["so_term"]
	end
	end

	def so_desc(feature)
	if hash = @data[feature]
	return hash["so_desc"]
	end
	end

	def ft_desc(feature)
	if hash = @data[feature]
	return hash["ft_desc"]
	end
	end
	end

	module Bio
	class GenBank
	def dblink
	fetch('DBLINK')
	end

	def bioproject
	dblink[/\d+/]
	end
	end
	end

	module RDFSupport
	def new_uuid(prefix = "http://genome.db/uuid/")
	#return "genome:uuid-#{SecureRandom.uuid}" or
	return "<#{prefix}#{SecureRandom.uuid}>"
	#return "#{prefix}:uuid-#{SecureRandom.uuid}"
	end

	def quote(str)
	return "\"#{str}\""
	end

	def triple(s, p, o)
	return [s, p, o].join("\t") + " ."
	end

	def default_prefix
	return [
	triple("@prefix", "rdf:", "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>"),
	triple("@prefix", "rdfs:", "<http://www.w3.org/2000/01/rdf-schema#>"),
	triple("@prefix", "dcterms:", "<http://purl.org/dc/terms/>"),
	triple("@prefix", "xsd:", "<http://www.w3.org/2001/XMLSchema#>"),
	triple("@prefix", "obo:", "<http://purl.obolibrary.org/obo/>"),
	]
	end

	def usdate2date(str)
	return Date.parse(str).strftime("%Y-%m-%d")
	end
	end


	class RefSeq2RDF

	include RDFSupport

	def initialize(io = ARGF, seqtype = nil, seqname = nil)
	set_prefixes
	set_links

	@seqtype = seqtype \|\| "SO:chromosome"
	@seqname = seqname \|\| "Chromosome"
	@ft_so = FT_SO.new

	puts prefix
	puts

	parse_refseq(io)
	end

	attr_accessor :prefix

	def set_prefixes
	@prefix = default_prefix + [
	triple("@prefix", "genome:", "<http://genome.db/sw/>"),
	triple("@prefix", "genomeid:", "<http://genome.db/id/>"),
	triple("@prefix", "genomeseq:", "<http://genome.db/seq/>"),
	triple("@prefix", "edgedb:", "<http://purl.jp/11/edgedb/>"),
	triple("@prefix", "faldo:", "<http://biohackathon.org/faldo/>"),
	]
	end

	def set_links
	@link = {
	:ncbi_genome => "http://www.ncbi.nlm.nih.gov/genome/?term=",
	:ncbi_gi => "http://www.ncbi.nlm.nih.gov/nuccore/",
	:ncbi_refseq => "http://www.ncbi.nlm.nih.gov/nuccore/",
	:ncbi_bioproject => "http://www.ncbi.nlm.nih.gov/bioproject/",
	:ncbi_pubmed => "http://www.ncbi.nlm.nih.gov/pubmed/",
	:ncbi_taxonomy => "http://www.ncbi.nlm.nih.gov/taxonomy/",
	:ncbi_geneid => "http://www.ncbi.nlm.nih.gov/gene/",
	:ncbi_protein => "http://www.ncbi.nlm.nih.gov/protein/",
	}
	end

	def new_location(pos, elem = false)
	loc_id = new_uuid

	puts triple(loc_id, "genome:position", quote(pos))

	@locations = Bio::Locations.new(pos)
	pos_start = new_uuid
	pos_end = new_uuid
	puts triple(loc_id, "faldo:start", pos_start)
	puts triple(loc_id, "faldo:end", pos_end)
	new_position(pos_start, @locations.range.min, @locations.first.strand)
	new_position(pos_end, @locations.range.max, @locations.last.strand)

	list = []
	if elem
	@locations.each do \|loc\|
	elem_id = new_uuid
	elem_start = new_uuid
	elem_end = new_uuid
	puts triple(elem_id, "rdfs:type", elem)
	puts triple(elem_id, "dcterms:isPartOf", loc_id)
	puts triple(elem_id, "faldo:start", elem_start)
	puts triple(elem_id, "faldo:end", elem_end)
	new_position(elem_start, loc.from, loc.strand)
	new_position(elem_end, loc.to, loc.strand)
	list << elem_id
	end
	end

	return loc_id, list
	end

	def new_position(pos_id, pos, strand)
	puts triple(pos_id, "faldo:position", pos)
	puts triple(pos_id, "faldo:reference", @sequence_id)
	puts triple(pos_id, "rdf:type", "faldo:ExactlyKnownPosition")
	if strand > 0
	puts triple(pos_id, "rdf:type", "faldo:ForwardStrandPosition")
	else
	puts triple(pos_id, "rdf:type", "faldo:ReverseStrandPosition")
	end
	end

	###
	### Main
	###

	def parse_refseq(io)
	# Read RefSeq entry
	Bio::FlatFile.auto(io).each do \|entry\|
	@entry = entry
	@features = entry.features
	@source = @features.shift
	parse_sequence
	parse_source
	parse_genes
	parse_cds
	parse_features
	end
	end

	###
	### Sequence
	###

	def parse_sequence
	@sequence_id = new_uuid

	# [TODO] How to identify the input is chromosome/plasmid/contig/...?
	sequence_type(@seqtype)
	# [TODO] Obtain rdfs:label from source /chromosome (eukaryotes) /plasmid (prokaryotes)
	sequence_label(@seqname)
	sequence_comment(@entry.definition)
	sequence_version(@entry.acc_version)
	sequence_length(@entry.nalen)
	# [TODO] provides REST API to retreive genomic DNA sequence by <@sequence_id.fasta>
	sequence_seq
	sequence_form(@entry.circular)
	# [TODO] sequenced date, modified in the source db or in our RDF data?
	sequence_date(@entry.date)
	# [TODO] rdfs:seeAlso (like UniProt) or dc:relation, owl:sameAs
	sequence_link_gi(@entry.gi.sub('GI:',''))
	sequence_link_accver(@entry.acc_version)
	sequence_link_bioproject(@entry.bioproject)
	# [TODO] how to deal with direct submissions (references without PMID)?
	sequence_ref(@entry.references)
	end

	def sequence_type(so = "SO:chromosome")
	case so
	when /0000340/, "SO:chromosome"
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000340")
	# [TODO] Do we need this?
	#puts triple(sequence_id, "genome:class", "genome:Chromosome")
	when /0000155/, "SO:plasmid"
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000155")
	when /0000736/, "SO:organelle_sequence"
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000736")
	when /0000819/, "SO:mitochondrial_chromosome"
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000819")
	when /0000740/, "SO:plastid_sequence"
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000740")
	when /0000719/, "SO:ultracontig"
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000719")
	when /0000148/, "SO:supercontig", "SO:scaffold"
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000148")
	when /0000149/, "SO:contig"
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000149")
	else
	# [TODO] What to do with this?
	puts triple(@sequence_id, "genome:class", "genome:Sequence")
	end
	end

	def sequence_label(str)
	# Use "name:" key in the JSON version
	puts triple(@sequence_id, "rdfs:label", quote(str))
	end

	def sequence_comment(str)
	# Use "description:" key in the JSON version
	puts triple(@sequence_id, "rdfs:comment", quote(str))
	end

	def sequence_version(str)
	# [TODO] What predicate to use?
	puts triple(@sequence_id, "genome:version", quote(str))
	end

	def sequence_length(int)
	puts triple(@sequence_id, "genome:length", int)
	end

	def sequence_seq
	#puts triple(@sequence_id, "genome:sequence", "genomeseq:#{@sequence_id}")
	puts triple(@sequence_id, "genome:sequence", @sequence_id.sub(/>$/, '.fasta>'))
	end

	def sequence_form(form)
	# [TODO] rdf:type or genome:molecularForm (which rhymes with molecularType)
	case form
	when "linear"
	#puts triple(@sequence_id, "genome:molecularForm", "obo:SO_0000987") # SO:linear
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000987") # SO:linear
	when "circular"
	#puts triple(@sequence_id, "genome:molecularForm", "obo:SO_0000988") # SO:circular
	puts triple(@sequence_id, "rdf:type", "obo:SO_0000988") # SO:circular
	end
	end

	def sequence_date(date)
	puts triple(@sequence_id, "dcterms:modified", quote(usdate2date(date))+"^^xsd:date")
	end

	def sequence_link_gi(str)
	uri = "<#{@link[:ncbi_gi]}#{str}>"
	puts triple(@sequence_id, "rdfs:seeAlso", uri)
	puts triple(uri, "edgedb:database", "edgedb:GI") if $edgedb
	end

	def sequence_link_accver(str)
	uri = "<#{@link[:ncbi_refseq]}#{str}>"
	puts triple(@sequence_id, "rdfs:seeAlso", uri)
	puts triple(uri, "edgedb:database", "edgedb:RefSeq") if $edgedb
	end

	def sequence_link_bioproject(str)
	uri = "<#{@link[:ncbi_bioproject]}#{str}>"
	puts triple(@sequence_id, "rdfs:seeAlso", uri)
	puts triple(uri, "edgedb:database", "edgedb:BioProject") if $edgedb
	end

	def sequence_ref(refs)
	refs.each do \|ref\|
	pmid = ref.pubmed
	if pmid.length > 0
	uri = "<#{@link[:ncbi_pubmed]}#{pmid}>"
	puts triple(@sequence_id, "rdfs:seeAlso", uri)
	puts triple(uri, "edgedb:database", "edgedb:PubMed") if $edgedb
	end
	end
	end

	###
	### Source
	###

	def parse_source
	# Use @sequence_id for @source_id
	@source_id = @sequence_id

	hash = @source.to_hash
	source_position(@source.position)
	source_link(hash["db_xref"])
	source_organism(hash["organism"].first)
	source_strain(hash["strain"].first) if hash["strain"]
	source_pseudo if hash["pseudo"]
	source_env(hash["isolation_source"] \|\| hash["environmental_sample"])
	source_moltype(hash["mol_type"])
	["db_xref", "organism", "strain", "pseudo", "isolation_source", "environmental_sample", "mol_type"].each do \|key\|
	hash.delete(key)
	end
	source_qualifiers(hash)
	end

	def source_position(pos)
	puts triple(@source_id, "genome:location", quote(pos))
	range = Bio::Locations.new(pos).range
	min = range.min
	max = range.max
	puts triple(@source_id, "genome:start", min)
	puts triple(@source_id, "genome:stop", max)
	end

	def source_link(links)
	links.each do \|link\|
	case link
	when /taxon:/
	ncbi_taxid = link[/taxon:(\d+)/, 1]
	uri = "<#{@link[:ncbi_taxonomy]}#{ncbi_taxid}>"
	puts triple(@source_id, "rdfs:seeAlso", uri)
	puts triple(uri, "edgedb:database", "edgedb:Taxonomy") if $edgedb
	else
	# [TODO] link to other organism specific DBs (/db_xref="ATCC:29413")
	db, entry_id = link.split(':', 2)
	if pfx = @link[db.downcase.to_sym]
	uri = "<#{pfx}#{entry_id}>"
	puts triple(@source_id, "rdfs:seeAlso", uri)
	puts triple(uri, "edgedb:database", "edgedb:#{db}") if $edgedb
	else
	$stderr.puts "edgedb:#{db} is missing!!"
	puts triple(@source_id, "genome:dbxref", quote(link))
	end
	end
	end
	end

	def source_organism(str)
	puts triple(@source_id, "genome:organism", quote(str))
	end

	def source_strain(str)
	puts triple(@source_id, "genome:strain", quote(str))
	end

	def source_pseudo
	puts triple(@source_id, "genome:pseudo", "true")
	end

	def source_env(vals)
	# Especially for ENV division: isolation_source (environmental_sample)
	if vals
	if vals.length > 0
	env = val.to_s.gsub(/\s+/, ' ').strip
	puts triple(@source_id, "genome:environment", quote(env))
	else
	puts triple(@source_id, "genome:environment", quote("N/A"))
	end
	end
	end

	def source_moltype(vals)
	# Especially useful for viruses
	vals.each do \|val\|
	puts triple(@source_id, "genome:molecularType", quote(val))
	end
	end

	def source_qualifiers(hash)
	hash.each do \|qual, vals\|
	vals.each do \|val\|
	if val == true
	#puts triple(@source_id, "genome:source_#{qual}", true)
	puts triple(@source_id, "genome:feature_#{qual}", true)
	else
	data = val.to_s.gsub(/\s+/, ' ').strip
	if data[/^\d+$/]
	#puts triple(@source_id, "genome:source_#{qual}", data)
	puts triple(@source_id, "genome:feature_#{qual}", data)
	else
	#puts triple(@source_id, "genome:source_#{qual}", quote(data))
	puts triple(@source_id, "genome:feature_#{qual}", quote(data))
	end
	end
	end
	end
	end

	###
	### genes
	###

	def parse_genes
	genes = @features.select {\|x\| x.feature == "gene"}

	@locus = {}
	count = 1
	genes.each do \|gene\|
	gene_id = new_uuid
	hash = gene.to_hash

	puts triple(gene_id, "rdf:type", "obo:SO_0000704") # SO:gene
	#puts triple(gene_id, "genome:class", "genome:Gene")

	puts triple(gene_id, "dcterms:isPartOf", @sequence_id)

	loc_id, _ = new_location(gene.position)
	puts triple(gene_id, "genome:location", loc_id)

	if hash["locus_tag"]
	locus_tag = hash["locus_tag"].first
	@locus[locus_tag] = gene_id
	puts triple(gene_id, "rdfs:label", quote(locus_tag))
	elsif hash["gene"]
	puts triple(gene_id, "rdfs:label", quote(hash["gene"].first))
	else
	# [TODO] Where else to find gene name?
	puts triple(gene_id, "rdfs:label", quote("gene#{count}"))
	end
	count += 1

	parse_qualifiers(gene_id, hash)
	end
	end

	###
	### CDS
	###

	def parse_cds
	cdss = @features.select {\|x\| x.feature == "CDS"}

	count = 1
	cdss.each do \|cds\|
	cds_id = new_uuid
	hash = cds.to_hash

	puts triple(cds_id, "rdf:type", "obo:SO_0000316") # SO:CDS
	#puts triple(cds_id, "genome:class", "genome:CDS")

	if hash["locus_tag"]
	if locus_tag = hash["locus_tag"].first
	gene_id = @locus[locus_tag]
	end
	end

	if gene_id
	puts triple(cds_id, "dcterms:isPartOf", gene_id)
	else
	# [TODO] sure to do this?
	puts triple(cds_id, "dcterms:isPartOf", @sequence_id)
	end

	if locus_tag
	puts triple(cds_id, "rdfs:label", quote(locus_tag))
	elsif hash["gene"]
	puts triple(cds_id, "rdfs:label", quote(hash["gene"].first))
	else
	puts triple(cds_id, "rdfs:label", quote("CDS#{count}"))
	end
	count += 1

	loc_id, exons = new_location(cds.position, "obo:SO_0000147") # SO:exon
	puts triple(cds_id, "genome:location", loc_id)
	puts triple(cds_id, "genome:exons", "(#{exons.join(' ')})") # rdf:List

	parse_qualifiers(cds_id, hash)
	end
	end

	###
	### Features
	###

	def parse_features
	features = @features.select {\|x\| ! x.feature[/^(gene\|CDS)$/]}

	features.each do \|feat\|
	feature = feat.feature
	feature_id = new_uuid
	hash = feat.to_hash

	puts triple(feature_id, "dcterms:isPartOf", @sequence_id)
	puts triple(feature_id, "rdfs:label", quote(feature))

	if so_id = @ft_so.so_id(feature)
	if so_id != "undefined"
	so = so_id.sub(':', '_')
	# SO:#{@ft_so.so_term(feature)}
	puts triple(feature_id, "rdf:type", "obo:#{so}")
	else
	# SO:sequence_feature
	puts triple(feature_id, "rdf:type", "obo:SO_0000110")
	end
	end

	loc_id, _ = new_location(feat.position)
	puts triple(feature_id, "genome:location", loc_id)

	parse_qualifiers(feature_id, hash)
	end
	end

	def parse_qualifiers(feature_id, hash)
	hash.each do \|qual, vals\|
	vals.each do \|val\|
	if val == true
	puts triple(feature_id, "genome:feature_#{qual}", true)
	else
	data = val.to_s.gsub(/\s+/, ' ').strip
	case qual
	when "protein_id"
	uri = "<#{@link[:ncbi_protein]}#{val}>"
	puts triple(feature_id, "rdfs:seeAlso", uri)
	puts triple(uri, "edgedb:database", "edgedb:Protein") if $edgedb
	when "db_xref"
	case val
	when /GI:/
	ncbi_gi = data[/GI:(\d+)/, 1]
	uri = "<#{@link[:ncbi_gi]}#{ncbi_gi}>"
	puts triple(feature_id, "genome:dbxref", uri)
	puts triple(uri, "edgedb:database", "edgedb:NcbiGI") if $edgedb
	when /GeneID:/
	ncbi_geneid = data[/GeneID:(\d+)/, 1]
	uri = "<#{@link[:ncbi_geneid]}#{ncbi_geneid}>"
	puts triple(feature_id, "genome:dbxref", uri)
	puts triple(uri, "edgedb:database", "edgedb:NcbiGeneID") if $edgedb
	else
	$stderr.puts "edgedb:#{val.sub(/:.*/,'')} is missing!!"
	puts triple(feature_id, "genome:dbxref", quote(val))
	end
	else
	if data[/^\d+$/]
	puts triple(feature_id, "genome:feature_#{qual}", data)
	else
	puts triple(feature_id, "genome:feature_#{qual}", quote(data))
	end
	end
	end
	end
	end
	end

	end


	if __FILE__ == $0
	require 'getoptlong'

	args = GetoptLong.new(
	[ '--seqtype', '-t', GetoptLong::REQUIRED_ARGUMENT ],
	[ '--seqname', '-n', GetoptLong::REQUIRED_ARGUMENT ],
	)

	opts = {}

	args.each_option do \|name, value\|
	case name
	when /--seqtype/
	opts[:seqtype] = value
	when /--seqname/
	opts[:seqname] = value
	end
	end

	RefSeq2RDF.new(ARGF, opts[:seqtype], opts[:seqname])
	end

	# obj = FT_SO.new
	# puts obj.so_id("-10_signal")
	# exit

	=begin
	genome:Region
	rdf:type rdfs:Class ;
	rdfs:label "Sequence" ;

	genome:Contig
	rdf:type rdfs:Class ;
	rdfs:label "Contig" ;

	genome:Gene
	rdf:type rdfs:Class ;
	rdfs:label "Gene" ;

	genome:length
	rdf:type rdf:Property ;
	rdfs:domain genome:Region ;
	rdfs:range xsd:int .
	=end
No results found