Created
January 16, 2013 21:51
-
-
Save cknoxrun/4551272 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'zip/zipfilesystem' | |
| require 'builder' | |
# Writes one "<id>.xml" entry per record of +scope+ into the zip archive at
# +zip_path+, then makes the archive world-readable.
#
# scope        - relation responding to count, includes and find_each
#                (e.g. Protein.exported)
# associations - association names to eager-load before serialising
# zip_path     - path of the zip archive to create or update
# label        - noun used in the failure log line ("protein", "metabolite")
#
# A record whose serialisation or archiving raises is logged to
# log/export_xml.log and skipped; the export continues with the next record.
def export_xml_archive(scope, associations, zip_path, label)
  progress = ProgressBar.new scope.count
  # Append mode: the proteins and metabolites tasks share this log file, so
  # neither may truncate what the other wrote. The block form closes the handle.
  File.open("log/export_xml.log", "a") do |log|
    Zip::ZipFile.open(zip_path, Zip::ZipFile::CREATE) do |zipfile|
      scope.includes(*associations).find_each do |record|
        begin
          data = record.to_xml
          zipfile.get_output_stream("#{record.id}.xml") do |f|
            f << data
          end
        rescue
          log.puts "Failed to add #{label} (#{record.id}) to archive"
        end
        progress.increment!
      end
    end
  end
  File.chmod(0644, zip_path)
  GC.start
end

namespace :hmdb do
  namespace :export do
    DOWNLOAD_PATH = 'public/downloads'

    desc "Run all export tasks"
    task :all => [ :xml, :structures, :sequences ]

    # desc 'Export all compounds to json format'
    # task :json => [:environment] do
    #   temp_file = Tempfile.new('json')
    #
    #   temp_file.puts Metabolite.exported.limit(10).to_json(:include => {
    #     # :synonyms => {:only=>:synonym},
    #     # :chem_kingdom => {:only=>:name},
    #     # :chem_super_class => {:only=>:name},
    #     # :chem_class => {:only=>:name},
    #     # :chem_sub_class => {:only=>:name},
    #     # :chem_substituents => {:only=>:name},
    #     # :growth_conditions => {:exclude=>:id},
    #     # :references => {:only=>[:altext,:pubmed_id]},
    #     # :pathways => {:only=>:name},
    #     :proteins => {:include => {
    #       # :synonyms => {:only => :synonym},
    #       # :enzyme_classes => {:only => :ec},
    #       # :go_classes => {:only => [:category, :description]},
    #       # :pfams => {:only => [:name,:identifier]},
    #       # :pathways => {:only => [:name, :kegg_map_id]},
    #       # :gene_sequence => {:only => :chain},
    #       # :protein_sequence => {:only => :chain}
    #   }}})
    #
    #
    #   temp_file.flush
    #   output_file = File.open(temp_file.path)
    #   cards = output_file.read
    #   temp_file.close
    #   output_file.close
    #
    #   # Output the card to zip file
    #   write_zip("#{DOWNLOAD_PATH}/hmdb.json.zip", "hmdb.json", cards)
    #   GC.start
    # end

    desc 'Export all compounds to xml format'
    task :xml, [:update_date] => [:environment, "xml:proteins", "xml:metabolites"]

    namespace :xml do
      # Each sub-task depends on :environment so it can also be run on its own.
      desc 'Export proteins to xml format'
      task :proteins => [:environment] do
        puts "Packing Proteins"
        export_xml_archive(
          Protein.exported,
          [ :accession_numbers, :metabolites, :enzyme_classes, :gene_sequence,
            :protein_sequence, :pfams, :go_classes, :pathways,
            :references, :subcellular_locations, :synonyms ],
          "#{DOWNLOAD_PATH}/hmdb_proteins.zip",
          "protein"
        )
      end

      desc 'Export metabolites to xml format'
      task :metabolites => [:environment] do
        puts "Packing Metabolites"
        export_xml_archive(
          Metabolite.exported,
          [ :metabolite_protein_links, :proteins, :link_set, :taxonomy,
            :concentrations, :tissues, :ontology, :diseases ],
          "#{DOWNLOAD_PATH}/hmdb_metabolites.zip",
          "metabolite"
        )
      end
    end

    # The task is kept so that invoking it does not fail, but its body is
    # disabled: it currently does nothing.
    desc "Export Spectra"
    task :spectra => [:environment] do
      # Spectra.all.each do |s|
      #   next if !s.compound.export
      #   if !s.spectra_image.original_filename.nil?
      #     filename = s.spectra_type.gsub(" ", "_").gsub("/","_")
      #     ext = ''
      #     if s.spectra_image.original_filename =~ /.*(\..*)/
      #       ext = $1
      #     else
      #       raise
      #     end
      #     peaklist = s.docs.where(:name => "Peak list").first
      #
      #     write_zip("#{DOWNLOAD_PATH}/spectra.zip", s.compound.met_id + "/" + filename + ext, File.open(s.spectra_image.path).read)
      #     if !peaklist.nil? && !peaklist.document.original_filename.nil?
      #       write_zip("#{DOWNLOAD_PATH}/spectra.zip", s.compound.met_id + "/" + filename + "_peaks.txt", File.open(peaklist.document.path).read)
      #     end
      #   end
      #
      # end
    end

    desc "Export structures (SDF)"
    task :structures => [:environment] do
      require 'compound_sdf_helper_helper'
      zipfilename = "#{DOWNLOAD_PATH}/structures.zip"
      puts "Building sdf file ( #{zipfilename} )"
      progress = ProgressBar.new Metabolite.exported.count
      Zip::ZipFile.open(zipfilename, Zip::ZipFile::CREATE) do |zipfile|
        zipfile.get_output_stream("structures.sdf") do |f|
          # Will give the compounds in order of id asc (which is what we want)
          Metabolite.exported.find_each do |compound|
            begin
              progress.increment!
              structure = CompoundSdfHelperHelper.compound_to_sdf(compound)
              f.puts(structure) unless structure.blank?
            rescue
              # A compound whose structure cannot be produced is reported and skipped.
              $stderr.puts "Can not export sdf for #{compound.hmdb_id} - error grabbing structure"
            end
          end
        end
        puts "Zipping"
      end
      File.chmod(0644, zipfilename)
    end

    desc "Export molecule sequences (FASTA)"
    task :sequences => [:environment] do
      sequences = molecule_sequences()
      [ :protein, :gene ].each do |sequence_type|
        # Path and archive handle get distinct names so the block parameter
        # does not shadow the path used by chmod below.
        zip_path = "#{DOWNLOAD_PATH}/sequences/#{sequence_type}.fasta.zip"
        Zip::ZipFile.open(zip_path, Zip::ZipFile::CREATE) do |zipfile|
          zipfile.get_output_stream("#{sequence_type}.fasta") do |f|
            f.puts(sequences[sequence_type])
          end
        end
        File.chmod(0644, zip_path)
      end
      GC.start
    end

    namespace :references do
      # Re-fetches the citation text of every reference that has a PubMed id.
      desc "Reset Cache PubMed Citations"
      task :reset => [:environment] do
        Reference.where("pubmed_id is not NULL").find_each do |ref|
          puts "Ref ID: " + ref.id.to_s
          ref.reference_text = annotate(ref.pubmed_id)
          ref.save!
        end
      end

      # Fetches the citation text only for references that do not have one yet.
      desc "Cache PubMed Citations"
      task :update => [:environment] do
        Reference.where("pubmed_id is not NULL and reference_text IS NULL").find_each do |ref|
          puts "Ref ID: " + ref.id.to_s
          ref.reference_text = annotate(ref.pubmed_id)
          # puts ref.altext
          ref.save!
        end
      end
    end
  end
end
# Builds the FASTA text for all exported proteins.
#
# Every protein contributes a ">hmdbp_id name" header line followed by
# chain.to_fasta_sequence: to the protein output when its protein_sequence is
# present, and to the gene output when its gene_sequence is present.
#
# Returns a Hash with two String values, keyed :gene and :protein.
def molecule_sequences
  fasta = { :gene => '', :protein => '' }

  # find_each walks the table in batches instead of loading every protein at
  # once; includes fetches both sequence associations with each batch.
  Protein.exported.includes(:protein_sequence, :gene_sequence).find_each do |molecule|
    header = ">#{molecule.hmdbp_id} #{molecule.name}\n"
    { :protein => molecule.protein_sequence, :gene => molecule.gene_sequence }.each do |type, sequence|
      next if sequence.blank?
      fasta[type] << header << sequence.chain.to_fasta_sequence << "\n"
    end
  end

  fasta
end
# Fetches the PubMed record for +pubmed_id+ and returns its formatted citation.
#
# pubmed_id - PubMed identifier, as an Integer or a numeric String
#
# Returns the String produced by Bio::MEDLINE#reference.format, prefixed with
# "UniProt Consortium" for PubMed id 21051339.
#
# Raises ArgumentError when pubmed_id does not convert to a positive integer,
# when the fetch returns nil, or when the fetched text contains
# "Error occurred". Any error raised while formatting is re-raised after the
# id has been printed.
def annotate(pubmed_id)
  raise ArgumentError unless pubmed_id.to_i > 0

  result = Bio::PubMed.efetch(pubmed_id)
  raise ArgumentError, pubmed_id.to_s if result.nil?

  # Join first so the error check runs against text; matching a regexp
  # directly against an Array never matches.
  record = Array(result).join("\n")
  raise ArgumentError, pubmed_id.to_s if record =~ /Error occurred/

  medline = Bio::MEDLINE.new(record)
  begin
    citation = medline.reference.format
  rescue
    # Print the offending id so a failing batch run shows which record broke.
    puts pubmed_id
    raise
  end

  # fix error in generating uniprot reference
  # (compare numerically: the id may arrive as a String)
  citation = "UniProt Consortium" + citation if pubmed_id.to_i == 21051339
  citation
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment