Created
May 12, 2011 09:14
-
-
Save inutano/968221 to your computer and use it in GitHub Desktop.
explore SRA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Reads the HTML files stored in one directory, collects each page's <title>
# text and prints three listings in this order:
#   1. every title with a trailing " - PubMed result" removed,
#   2. the bracketed part of each title, sorted, minus its last five characters,
#   3. the four characters just before the closing bracket of each title, sorted.
# NOTE(review): presumably the bracket holds something like "[Journal. 2010]",
# making (2) the journal and (3) the year -- confirm against real pages.
require "nokogiri"

hako = "/path/to/directory_contains_pubmed_html"

# Skip the "url_list" file and every entry whose name contains a dot
# (which also drops "." and "..").
html_files = Dir.entries(hako).reject do |item|
  item.include?("url_list") || item.include?(".")
end

# File.open with a block closes each handle as soon as the page is parsed.
title_list = html_files.map do |item|
  File.open("#{hako}/#{item}") do |file|
    (Nokogiri::HTML(file) / "title").inner_text
  end
end

title_list.each do |title|
  puts title.gsub(/( - PubMed result)$/, "")
end

# Titles without a bracketed part yield nil and are dropped by compact.
journal_list = title_list.map { |title| title[/\[(.*)\]/, 1] }.compact
journal_list.sort.each do |journal|
  puts journal.gsub(/(.....)$/, "")
end

year_list = title_list.map { |title| title[/\[.*(....)\]/, 1] }.compact
puts year_list.sort
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
require "open-uri"
require "json"
require "nokogiri"
require "pp"
# Collects the sample links of every study that DRASearch lists for an
# organism / study type, saves them as JSON and returns them.
#
# Two result pages are requested (the plain query with show=100, then the same
# query with page=2). Every study link found on them is fetched in turn and the
# sample links on each study page are gathered. The combined list is written to
# "./<organism>_<study_type>_sample_id.json" (with "+" replaced by "_").
#
# kaneganaitoka - organism, interpolated into the query string as-is
#                 (e.g. "Mus+musculus")
# nantokasiroya - study type, interpolated as-is (e.g. "Transcriptome+Analysis")
#
# Returns an Array of relative links such as "sample?acc=...".
# Network and file errors are not rescued here and propagate to the caller.
def sratohananndattanoka(kaneganaitoka, nantokasiroya) # argument: organism, study_type
  mishima = "http://trace.ddbj.nig.ac.jp/DRASearch/"
  first_page = "#{mishima}query?organism=#{kaneganaitoka}&study_type=#{nantokasiroya}&show=100"
  result_pages = [first_page, "#{first_page}&page=2"]

  # [^"]* keeps each capture inside its own href value; a greedy .* would merge
  # several links that share one line into a single bogus match.
  study_links = result_pages.map { |page_url|
    URI(page_url).read.scan(/href="(study\?acc=[^"]*)" target/)
  }.flatten

  sample_id_full_list = study_links.map { |study_url|
    URI("#{mishima}#{study_url}").read.scan(/href="(sample\?acc=[^"]*)" target/)
  }.flatten

  # save full sample list as json format file
  n = kaneganaitoka.gsub("+", "_")
  s = nantokasiroya.gsub("+", "_")
  File.open("./#{n}_#{s}_sample_id.json", "w") { |f| f.puts JSON.dump(sample_id_full_list) }

  sample_id_full_list
end
# Looks up the Entrez link recorded for the submission a sample belongs to.
#
# Fetches the DRASearch page of the sample, takes the first submission link on
# it, then reads "<xml_root>/<first 6 chars of id>/<id>/<id>.study.xml" from the
# local disk and extracts the text of its ENTREZ_LINK/DB and ENTREZ_LINK/ID
# elements.
#
# majide   - relative sample link (e.g. "sample?acc=SRAXXXXXX")
# xml_root - directory holding the per-submission XML folders; defaults to the
#            location the script was written against
#
# Returns [db_text, id_text] on success. This is a best-effort lookup: when the
# page has no submission link, or anything raises (fetch failure, missing XML
# file, parse error), the String "no data" is returned instead.
def mouNCBInantesiruka(majide, xml_root = "/Users/iNut/togofarm/xmldono/Submissions") # argument: sampleid (e.g. "sample?acc=SRAXXXXXX")
  mishima = "http://trace.ddbj.nig.ac.jp/DRASearch/"
  sample_page = URI("#{mishima}#{majide}").read

  # Take only the first link: joining every match would glue several ids into
  # one path that cannot exist.
  sub_id = sample_page[/href="submission\?acc=([^"]*)" target/, 1]
  return "no data" unless sub_id

  sub_id_index = sub_id.slice(0, 6)
  xmldono = File.read("#{xml_root}/#{sub_id_index}/#{sub_id}/#{sub_id}.study.xml")
  nakami = Nokogiri::XML(xmldono)
  entrez_link_db = (nakami / "ENTREZ_LINK" / "DB").inner_text
  entrez_link_id = (nakami / "ENTREZ_LINK" / "ID").inner_text
  [entrez_link_db, entrez_link_id]
rescue StandardError
  "no data"
end
# Turns [db_text, id_text] pairs into a de-duplicated list of PubMed URLs.
#
# For a pair whose db text starts with "pubmed" the first eight characters of
# the id text are used; when the db text ends with "pubmed" the last eight
# characters are used. Pairs that match neither are ignored.
# NOTE(review): this presumably relies on PMIDs being exactly eight digits and
# on the db/id texts being concatenated in the same order -- confirm.
#
# ronbunronbun - Array whose elements are [db_text, id_text] pairs; anything
#                that is not an Array (e.g. a "no data" placeholder string) is
#                skipped
#
# Returns an Array of "http://www.ncbi.nlm.nih.gov/pubmed/<id>" Strings in
# first-seen order, without duplicates and without empty ids.
def ronbundasumadegasequencedesu(ronbunronbun)
  pmid_list = []
  ronbunronbun.each do |pair|
    next unless pair.is_a?(Array)

    db, id = pair
    pmid =
      if db =~ /^pubmed/
        id.slice(0, 8)
      elsif db =~ /pubmed$/
        id.scan(/.*(........)$/).join("")
      end
    # An empty id would only produce the bare base URL, so drop it.
    pmid_list.push(pmid) unless pmid.nil? || pmid.empty?
  end
  pmid_list.uniq.map { |pmid| "http://www.ncbi.nlm.nih.gov/pubmed/" + pmid }
end
# Script entry point. Collects the sample links for one organism / study type,
# looks up the Entrez link of every sample, writes the distinct lookup results
# to "<organism>_<study_type>_db_id.json" and prints the PubMed URLs derived
# from them. Organism and study type may be given as the first and second
# command-line argument; without arguments the original defaults are used.
if __FILE__ == $0
  # organism = "Homo+sapiens"
  organism = ARGV[0] || "Mus+musculus"
  study_type = ARGV[1] || "Transcriptome+Analysis"
  n = organism.gsub("+", "_")
  s = study_type.gsub("+", "_")

  sample_list = sratohananndattanoka(organism, study_type)
  # De-duplicated only; the order is the order in which samples were looked up.
  db_id_pairs = sample_list.map { |id| mouNCBInantesiruka(id) }.uniq

  File.open("#{n}_#{s}_db_id.json", "w") { |f| JSON.dump(db_id_pairs, f) }
  puts ronbundasumadegasequencedesu(db_id_pairs)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment