Skip to content

Instantly share code, notes, and snippets.

@cknoxrun
Created January 16, 2013 21:52
Show Gist options
  • Save cknoxrun/4551284 to your computer and use it in GitHub Desktop.
Save cknoxrun/4551284 to your computer and use it in GitHub Desktop.
require 'zip/zipfilesystem'
require 'builder'
# Rake tasks that publish HMDB data as zip archives under DOWNLOAD_PATH and
# cache PubMed citations on Reference records.
#
# NOTE(review): this block previously contained committed, unresolved
# merge-conflict markers (HEAD vs master) and could not be parsed. It was
# resolved as follows -- please confirm against the intended branch state:
#   * xml export keeps HEAD's split into xml:proteins / xml:metabolites
#     sub-tasks (both now really live inside `namespace :xml`, which is what
#     the "xml:..." prerequisites of the :xml task require) and uses master's
#     wider `includes` list plus `find_each` for proteins.
#   * each xml sub-task opens the shared log itself (HEAD's metabolites task
#     referenced an undefined `log`); the log is opened in append mode so the
#     second sub-task does not erase the entries written by the first.
#   * :spectra is kept as master's placeholder task (body still disabled).
#   * :structures uses master's `hmdb_id` in its error message.
#   * :sequences uses master's inline zip code; `write_zip` is not defined in
#     this file.
#   * `namespace :references` (master only) is kept.
namespace :hmdb do
  namespace :export do
    # Directory that receives every generated archive.
    DOWNLOAD_PATH = 'public/downloads'

    desc "Run all export tasks"
    task :all => [ :xml, :structures, :sequences ]

    # desc 'Export all compounds to json format'
    # task :json => [:environment] do
    # temp_file = Tempfile.new('json')
    #
    # temp_file.puts Metabolite.exported.limit(10).to_json(:include => {
    # # :synonyms => {:only=>:synonym},
    # # :chem_kingdom => {:only=>:name},
    # # :chem_super_class => {:only=>:name},
    # # :chem_class => {:only=>:name},
    # # :chem_sub_class => {:only=>:name},
    # # :chem_substituents => {:only=>:name},
    # # :growth_conditions => {:exclude=>:id},
    # # :references => {:only=>[:altext,:pubmed_id]},
    # # :pathways => {:only=>:name},
    # :proteins => {:include => {
    # # :synonyms => {:only => :synonym},
    # # :enzyme_classes => {:only => :ec},
    # # :go_classes => {:only => [:category, :description]},
    # # :pfams => {:only => [:name,:identifier]},
    # # :pathways => {:only => [:name, :kegg_map_id]},
    # # :gene_sequence => {:only => :chain},
    # # :protein_sequence => {:only => :chain}
    # }}})
    #
    #
    # temp_file.flush
    # output_file = File.open(temp_file.path)
    # cards = output_file.read
    # temp_file.close
    # output_file.close
    #
    # # Output the card to zip file
    # write_zip("#{DOWNLOAD_PATH}/hmdb.json.zip", "hmdb.json", cards)
    # GC.start
    # end

    desc 'Export all compounds to xml format'
    task :xml, [:update_date] => [:environment, "xml:metabolites", "xml:proteins"]

    namespace :xml do
      desc 'Export proteins to xml format'
      task :proteins => [:environment] do
        puts "Packing Proteins"
        progress = ProgressBar.new Protein.exported.count
        zipfilename = "#{DOWNLOAD_PATH}/hmdb_proteins.zip"
        # Block form closes the log even if the export aborts.
        File.open("log/export_xml.log", "a") do |log|
          Zip::ZipFile.open(zipfilename, Zip::ZipFile::CREATE) do |zipfile|
            Protein.exported.includes(:accession_numbers, :metabolites, :enzyme_classes, :gene_sequence,
                                      :protein_sequence, :pfams, :go_classes, :pathways,
                                      :references, :subcellular_locations, :synonyms).find_each do |m|
              begin
                # Serialise before opening the entry so a failing to_xml
                # does not leave a half-created entry behind.
                data = m.to_xml
                zipfile.get_output_stream("#{m.id}.xml") do |f|
                  f << data
                end
              rescue
                # Best effort: record the failure and continue with the next record.
                log.puts "Failed to add protein (#{m.id}) to archive"
              end
              progress.increment!
            end
          end
        end
        File.chmod(0644, zipfilename)
      end

      desc 'Export metabolites to xml format'
      task :metabolites => [:environment] do
        puts "Packing Metabolites"
        progress = ProgressBar.new Metabolite.exported.count
        zipfilename = "#{DOWNLOAD_PATH}/hmdb_metabolites.zip"
        File.open("log/export_xml.log", "a") do |log|
          Zip::ZipFile.open(zipfilename, Zip::ZipFile::CREATE) do |zipfile|
            Metabolite.exported.includes(:metabolite_protein_links, :proteins, :link_set, :taxonomy,
                                         :concentrations, :tissues, :ontology, :diseases).find_each do |m|
              begin
                data = m.to_xml
                zipfile.get_output_stream("#{m.id}.xml") do |f|
                  f << data
                end
              rescue
                log.puts "Failed to add metabolite (#{m.id}) to archive"
              end
              progress.increment!
            end
          end
        end
        File.chmod(0644, zipfilename)
        GC.start
      end
    end
  end

  desc "Export Spectra"
  task :spectra => [:environment] do
    # Currently a no-op: the export below is disabled.
    # Spectra.all.each do |s|
    # next if !s.compound.export
    # if !s.spectra_image.original_filename.nil?
    # filename = s.spectra_type.gsub(" ", "_").gsub("/","_")
    # ext = ''
    # if s.spectra_image.original_filename =~ /.*(\..*)/
    # ext = $1
    # else
    # raise
    # end
    # peaklist = s.docs.where(:name => "Peak list").first
    #
    # write_zip("#{DOWNLOAD_PATH}/spectra.zip", s.compound.met_id + "/" + filename + ext, File.open(s.spectra_image.path).read)
    # if !peaklist.nil? && !peaklist.document.original_filename.nil?
    # write_zip("#{DOWNLOAD_PATH}/spectra.zip", s.compound.met_id + "/" + filename + "_peaks.txt", File.open(peaklist.document.path).read)
    # end
    # end
    #
    # end
  end

  desc "Export structures (SDF)"
  task :structures => [:environment] do
    # Loaded lazily: only this task needs the helper.
    require 'compound_sdf_helper_helper'
    zipfilename = "#{DOWNLOAD_PATH}/structures.zip"
    puts "Building sdf file ( #{zipfilename} )"
    progress = ProgressBar.new Metabolite.exported.count
    Zip::ZipFile.open(zipfilename, Zip::ZipFile::CREATE) do |zipfile|
      zipfile.get_output_stream("structures.sdf") do |f|
        # Will give the compounds in order of id asc (which is what we want)
        Metabolite.exported.find_each do |compound|
          begin
            progress.increment!
            structure = CompoundSdfHelperHelper.compound_to_sdf(compound)
            f.puts(structure) unless structure.blank?
          rescue
            $stderr.puts "Can not export sdf for #{compound.hmdb_id} - error grabbing structure"
          end
        end
      end
      puts "Zipping"
    end
    File.chmod(0644, zipfilename)
  end

  desc "Export molecule sequences (FASTA)"
  task :sequences => [:environment] do
    sequences = molecule_sequences()
    [ :protein, :gene ].each do |sequence_type|
      archive_path = "#{DOWNLOAD_PATH}/sequences/#{sequence_type}.fasta.zip"
      Zip::ZipFile.open(archive_path, Zip::ZipFile::CREATE) do |zipfile|
        zipfile.get_output_stream("#{sequence_type}.fasta") do |f|
          f.puts(sequences[sequence_type])
        end
      end
      File.chmod(0644, archive_path)
    end
    GC.start
  end

  namespace :references do
    desc "Reset Cache Pudmed Citations"
    task :reset => [:environment] do
      # Re-fetches the citation text of every reference that has a pubmed_id.
      Reference.where("pubmed_id is not NULL").each do |ref|
        puts "Ref ID: " + ref.id.to_s
        ref.reference_text = annotate(ref.pubmed_id)
        ref.save!
      end
    end

    desc "Cache Pudmed Citations"
    task :update => [:environment] do
      # Only fills in references whose citation text is still missing.
      Reference.where("pubmed_id is not NULL and reference_text IS NULL").each do |ref|
        puts "Ref ID: " + ref.id.to_s
        ref.reference_text = annotate(ref.pubmed_id)
        ref.save!
      end
    end
  end
end
# Builds the FASTA text for all exported proteins.
#
# For every protein, a record (a ">hmdbp_id name" header line followed by the
# sequence body) is appended to the protein text and/or the gene text,
# depending on which of the two sequences is present.
#
# Returns a Hash with the keys :gene and :protein, each mapping to one String.
def molecule_sequences
  fasta = { :gene => '', :protein => '' }

  Protein.exported.all.each do |protein|
    [ [:protein, protein.protein_sequence], [:gene, protein.gene_sequence] ].each do |kind, sequence|
      next if sequence.blank?

      fasta[kind] << ">#{protein.hmdbp_id} #{protein.name}\n"
      fasta[kind] << sequence.chain.to_fasta_sequence << "\n"
    end
  end

  fasta
end
# Fetches the PubMed record for +pubmed_id+ and returns its formatted citation.
#
# pubmed_id - PubMed identifier; anything whose +to_i+ is a positive integer
#             (Integer or numeric String).
#
# Returns the citation String produced by Bio::MEDLINE's reference formatter.
# Raises ArgumentError when the id is not a positive number, when the fetch
# returns nil, or when the fetched text contains "Error occurred".
# Errors raised while formatting are re-raised after printing the id.
def annotate(pubmed_id)
  raise ArgumentError unless pubmed_id.to_i > 0

  result = Bio::PubMed.efetch(pubmed_id)
  raise ArgumentError, pubmed_id.to_s if result.nil?

  # The fetch result is joined before use, i.e. it is a list of text chunks.
  # The error marker therefore has to be searched for in the joined text:
  # matching a regexp against the Array itself never succeeds.
  record = Array(result).join("\n")
  raise ArgumentError, pubmed_id.to_s if record =~ /Error occurred/

  medline = Bio::MEDLINE.new(record)
  begin
    citation = medline.reference.format
  rescue
    # Print the offending id so a failing batch run can be diagnosed.
    puts pubmed_id
    raise
  end

  # fix error in generating uniprot reference
  # (compare numerically so that string ids get the same treatment)
  citation = "UniProt Consortium" + citation if pubmed_id.to_i == 21051339

  citation
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment