Last active
August 29, 2015 14:02
-
-
Save tfuji/f357c77e63c43092b55f to your computer and use it in GitHub Desktop.
genome_reports2ttl_v2.rb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# convert genome_reports to RDF
# * ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt | |
# * ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt | |
# | |
require 'date' | |
# Characters that need a backslash escape inside a double-quoted Turtle
# string literal, mapped to their escaped spelling.
TURTLE_ESCAPES = {
  '\\' => '\\\\',
  '"'  => '\\"',
  "\t" => '\\t',
  "\n" => '\\n',
  "\r" => '\\r'
}.freeze

# Render a value as a double-quoted Turtle string literal.
#
# @param str [#to_s] value to serialise; nil becomes the empty literal ""
# @return [String] the escaped text wrapped in double quotes
def quote(str)
  # Escape every special character in a single pass and add the quotes by
  # hand. The block form of gsub inserts the replacement literally, which
  # avoids the backslash handling of replacement strings, and not calling
  # String#inspect avoids escaping the result a second time or emitting
  # Ruby-only escapes such as \#{ that Turtle does not define.
  escaped = str.to_s.gsub(/[\\"\t\n\r]/) { |ch| TURTLE_ESCAPES[ch] }
  %("#{escaped}")
end
# Serialise a date column value.
#
# The value is currently emitted as a plain quoted string via quote.
# A typed xsd:date conversion is kept below for reference but is disabled:
#   str = '0001/01/01' if str == '-'
#   quote(Date.parse(str).strftime("%Y-%m-%d")) + "^^xsd:date"
def resource_date(str)
  quote(str)
end
# Print the Turtle prefix header to stdout: the obo: declaration followed
# by two blank lines.
#
# NOTE(review): output_pv emits predicates with the empty prefix
# (e.g. ":organism_name") but no "@prefix :" is declared here -- confirm
# whether the declaration is supplied elsewhere.
def output_prefix
  header_lines = ["@prefix obo: <http://purl.obolibrary.org/obo/> .", "", ""]
  header_lines.each { |text| puts text }
end
# Map an assembly "Status" column value to a Sequence Ontology term
# written as an obo: prefixed name.
#
# @param str [String] status text taken from the report table
# @return [String] prefixed name such as "obo:SO_0000149"
# @raise [ArgumentError] when the status is not one of the known values
def status2so(str)
  case str
  when "Contig"
    "obo:SO_0000149"
  when "Complete", "Scaffold"
    "obo:SO_0000148"
  when "Gapless Chromosome", "Chromosome", "Chromosome with gaps"
    "obo:SO_0000340"
  else
    # The original referenced an undefined local `k` and an undefined
    # `error`, so this branch died with NameError before reporting anything.
    warn "undefied status: #{str}"
    raise ArgumentError, "undefined status: #{str.inspect}"
  end
end
# Print the Turtle predicate/object line(s) for one column of a report row.
#
# Every column is written as a quoted literal; some columns additionally
# get a resource link (identifiers.org IRI or SO term). Lines end with " ;"
# so the caller is expected to close the subject with ".".
#
# @param k [String] column name from the table header
# @param v [String, nil] cell value; nil when the row has fewer cells than
#   the header, '-' when the source marks the value as absent
# @raise [ArgumentError] when the column name is not handled
def output_pv(k, v)
  # Resource links are only meaningful for a real identifier. Guarding on
  # nil as well keeps short rows from crashing in v.split below.
  linkable = !v.nil? && v != '-'

  case k
  when 'Organism/Name'
    puts "\t:organism_name\t#{quote(v)} ;"
  when 'TaxID'
    puts "\t:tax_id\t#{quote(v)} ;"
    puts "\t:taxon\t<http://identifiers.org/taxonomy/#{v}> ;" if linkable
  when 'BioProject Accession'
    puts "\t:bioproject_accession\t#{quote(v)} ;"
    # Guarded like the taxon and biosample links for consistency.
    puts "\t:bioproject\t<http://identifiers.org/bioproject/#{v}> ;" if linkable
  when 'BioProject ID'
    puts "\t:bioproject_id\t#{quote(v)} ;"
  when 'Group'
    puts "\t:group\t#{quote(v)} ;"
  when 'SubGroup'
    puts "\t:subgroup\t#{quote(v)} ;"
  when 'Size (Mb)'
    puts "\t:size\t#{quote(v)} ;"
  when 'GC%'
    puts "\t:gc\t#{quote(v)} ;"
  when 'Assembly Accession'
    puts "\t:assembly_accession\t#{quote(v)} ;"
  when 'Chromosomes'
    puts "\t:chromosomes\t#{quote(v)} ;"
  when 'Organelles'
    puts "\t:organelles\t#{quote(v)} ;"
  when 'Plasmids'
    puts "\t:plasmids\t#{quote(v)} ;"
  when 'WGS'
    puts "\t:wgs\t#{quote(v)} ;"
  when 'Scaffolds'
    puts "\t:scaffolds\t#{quote(v)} ;"
  when 'Genes'
    puts "\t:genes\t#{quote(v)} ;"
  when 'Proteins'
    puts "\t:proteins\t#{quote(v)} ;"
  when 'Release Date'
    puts "\t:release_date\t#{resource_date(v)} ;"
  when 'Modify Date'
    puts "\t:modify_date\t#{resource_date(v)} ;"
  when 'Status'
    puts "\t:status\t#{quote(v)} ;"
    puts "\t:status2so\t#{status2so(v)} ;"
  when 'Center'
    puts "\t:center\t#{quote(v)} ;"
  when 'BioSample Accession'
    puts "\t:biosample_accession\t#{quote(v)} ;"
    puts "\t:biosample\t<http://identifiers.org/biosample/#{v}> ;" if linkable
  when 'Chromosomes/RefSeq'
    # The trailing "#only prokaryotes" is part of the emitted line.
    puts "\t:chromosomes_refseq\t#{quote(v)} ; #only prokaryotes"
    if linkable
      v.split(",").each { |vv| puts "\t:chromosome\t<http://identifiers.org/refseq/#{vv}> ;" }
    end
  when 'Chromosomes/INSDC'
    puts "\t:chromosomes_insdc\t#{quote(v)} ; #only prokaryotes"
  when 'Plasmids/RefSeq'
    puts "\t:plasmids_refseq\t#{quote(v)} ; #only prokaryotes"
    if linkable
      v.split(",").each { |vv| puts "\t:plasmid\t<http://identifiers.org/refseq/#{vv}> ;" }
    end
  when 'Plasmids/INSDC'
    puts "\t:plasmids_insdc\t#{quote(v)} ; #only prokaryotes"
  when 'Reference'
    puts "\t:reference\t#{quote(v)}; #only prokaryotes"
  when 'FTP Path'
    puts "\t:ftp_path\t#{quote(v)}; #only prokaryotes"
  when 'Pubmed ID'
    puts "\t:pubmed_id\t#{quote(v)} ; #only prokaryotes"
  else
    # Print a ready-to-paste `when` stub for the unhandled column, then
    # abort. The original `raise error` referenced an undefined name.
    puts " when '#{k}'"
    warn "undefied key: #{k}"
    raise ArgumentError, "undefined key: #{k.inspect}"
  end
end
# Read both report tables into one list of { column name => cell } hashes.
# The first line of each file is the header; a leading '#' and any carriage
# returns are removed from it before splitting on tabs.
ary = %w(GENOME_REPORTS/prokaryotes.txt GENOME_REPORTS/eukaryotes.txt).flat_map do |input_file|
  header_line, *data_lines = File.readlines(input_file)
  next [] if header_line.nil?

  head = header_line.strip.gsub("\r", "").gsub(/^#/, "").split("\t")
  data_lines.map { |row| Hash[head.zip(row.strip.split("\t"))] }
end

# Status counts seen in one run:
# {"Contig"=>12234, "Gapless Chromosome"=>2935, "Complete"=>25, "Scaffold"=>11279, "Chromosome"=>533, "Chromosome with gaps"=>339}
output_prefix

# Emit one Turtle subject per row, keyed by its BioProject accession, and
# tally the Status column along the way.
status = Hash.new(0)
ary.each do |project|
  puts "<http://identifiers.org/bioproject/#{project["BioProject Accession"]}>"
  project.each { |k, v| output_pv(k, v) }
  puts "."
  status[project["Status"]] += 1
end

# Report the per-status counts on stderr.
warn status
# TSV format errors in prokaryotes.txt | |
#< #Organism/Name TaxID BioProject Accession BioProject ID Group SubGroup Size (Mb) GC% Chromosomes/RefSeq Chromosomes/INSDC Plasmids/RefSeq Plasmids/INSDC WGS Scaffolds Genes Proteins Release Date Modify Date Status Center BioSample Accession Assembly Accession Reference FTP Path Pubmed ID | |
#> #Organism/Name TaxID BioProject Accession BioProject ID Group SubGroup Size (Mb) GC% Chromosomes/RefSeq Chromosomes/INSDC Plasmids/RefSeq Plasmids/INSDC WGS Scaffolds Genes Proteins Release Date Modify Date Status Center BioSample Accession Assembly Accession Reference FTP Path Pubmed ID | |
#< "The Federal Goverment Health Institution ""Stavropol Plague Control Reseach Institute"" of the Federal Service for Supervision in the Sphere of Consumer Rights Protection and Human Welfare" | |
#> The Federal Goverment Health Institution "Stavropol Plague Control Reseach Institute" of the Federal Service for Supervision in the Sphere of Consumer Rights Protection and Human Welfare | |
#< """National Center for Biotechnology"" RSE" | |
#> "National Center for Biotechnology" RSE | |
#< "1Centre ""Bioengineering"" of Russian Academy of Sciences" | |
#> 1Centre "Bioengineering" of Russian Academy of Sciences | |
## prokaryotes
#["Organism/Name", "Campylobacter jejuni subsp. jejuni CG8421"] | |
#["TaxID", "478547"] | |
#["BioProject Accession", "PRJNA21037"] | |
#["BioProject ID", "21037"] | |
#["Group", "Proteobacteria"] | |
#["SubGroup", "delta/epsilon subdivisions"] | |
#["Size (Mb)", "1.60894"] | |
#["GC%", "30.3"] | |
#["Chromosomes/RefSeq", "-"] | |
#["Chromosomes/INSDC", "-"] | |
#["Plasmids/RefSeq", "-"] | |
#["Plasmids/INSDC", "-"] | |
#["WGS", "ABGQ01"] | |
#["Scaffolds", "20"] | |
#["Genes", "1590"] | |
#["Proteins", "1512"] | |
#["Release Date", "2008/09/19"] | |
#["Modify Date", "2014/01/08"] | |
#["Status", "Contig"] | |
#["Center", "Naval Medical Research Center"] | |
#["BioSample Accession", "SAMN02470701"] | |
#["Assembly Accession", "GCA_000171795.1"] | |
#["Reference", "-"] | |
#["FTP Path", "Campylobacter_jejuni/GCF_000171795"] | |
#["Pubmed ID", "18809665"] | |
# | |
#eukaryotes | |
#["Organism/Name", "Emiliania huxleyi CCMP1516"] | |
#["TaxID", "280463"] | |
#["BioProject Accession", "PRJNA77753"] | |
#["BioProject ID", "77753"] | |
#["Group", "Protists"] | |
#["SubGroup", "Other Protists"] | |
#["Size (Mb)", "167.676"] | |
#["GC%", "64.5"] | |
#["Assembly Accession", "GCA_000372725.1"] | |
#["Chromosomes", "-"] | |
#["Organelles", "-"] | |
#["Plasmids", "-"] | |
#["WGS", "AHAL01"] | |
#["Scaffolds", "7795"] | |
#["Genes", "38549"] | |
#["Proteins", "38554"] | |
#["Release Date", "2013/04/19"] | |
#["Modify Date", "2013/07/08"] | |
#["Status", "Scaffold"] | |
#["Center", "JGI"] | |
#["BioSample Accession", "-"] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment