Created
March 24, 2011 06:27
-
-
Save slowkow/884661 to your computer and use it in GitHub Desktop.
Query any NCBI database (nucleotide, protein, nucgss, etc.) and retrieve resulting records.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# Author : Kamil Slowikowski <[email protected]>
# Date : March 23, 2011
# Version : 0.1
# Description : Modified code from
# http://www.bioperl.org/wiki/HOWTO:EUtilities_Cookbook
use strict; | |
use warnings; | |
use Bio::DB::EUtilities; | |
use Getopt::Long::Descriptive; | |
# Command-line interface.
# Each option is given as [ 'name|short=type', help text, { constraints } ].
# The parsed values are read through accessors on $opt (e.g. $opt->retmax);
# $usage renders the help text.
my ($opt, $usage) = describe_options(
    '%c [options...]',
    [ 'output|o=s',
        "write returned data to this file, default is stdout",
        { default => 'stdout' }
    ],
    [ 'query|q=s',
        "query sent to NCBI",
        { default => '' }
    ],
    [ 'db|d=s',
        "NCBI database to query, default is nucgss",
        { default => 'nucgss' } # nucleotide
    ],
    [ 'listdatabases|l',
        "list all available databases and exit",
    ],
    [ 'email|e=s',
        "your email, so NCBI can track your project "
        . "and contact you if there is a problem",
        { default => '[email protected]' }
    ],
    [ 'retstart|s=i',
        "number of results to skip, default is 0",
        { default => 0 }
    ],
    [ 'retmax|m=i',
        "number of results to download at a time, default is 500",
        {
            default   => 500,
            # A batch size below 1 would never advance through the results,
            # so reject it while parsing the command line.
            callbacks => { 'positive integer' => sub { shift() > 0 } },
        }
    ],
    [ 'rettype|t=s',
        "format of returned data, default is fasta",
        { default => 'fasta' }
    ],
    [ 'retries|r=i',
        "number of retries before giving up if the server doesn't respond",
        { default => 5 }
    ],
    [],
    [ 'help|h', "print usage message and exit" ],
);
# --listdatabases: query EInfo for the database names, print them sorted on
# a single space-separated line, then exit without running a search
if ($opt->listdatabases) {
    my $einfo = Bio::DB::EUtilities->new(
        -eutil => 'einfo',
        -email => $opt->email,
    );
    my @databases = $einfo->get_available_databases;
    print join(' ', sort @databases), "\n";
    exit;
}
# show the usage text and stop when --help was given or the query is empty
my $query_is_empty = length($opt->query) == 0;
if ($query_is_empty || $opt->help) {
    print $usage->text;
    exit;
}
# create an ESearch object for the requested database and query;
# -usehistory => 'y' is passed so a history entry is available for the
# fetch step that follows
my $factory = Bio::DB::EUtilities->new(
    -eutil      => 'esearch',
    -db         => $opt->db,
    -term       => $opt->query,
    -email      => $opt->email,
    -usehistory => 'y',
);

# die if the query fails to be translated, and say which query and
# database were involved so the failure can be diagnosed
my $translation = $factory->get_query_translation
    or die "No query translation returned for query '" . $opt->query
         . "' in database '" . $opt->db . "'\n";

# check how many results we got back
my $count = $factory->get_count;

# status goes to stderr so it never mixes with data written to stdout
warn "Query translates to '$translation'\n";
warn "Found $count results\n";
# Ask for confirmation on stderr and continue only on an explicit
# "y", "ye" or "yes" (any case, surrounding whitespace ignored).
# Anything else, including end-of-input on STDIN, exits quietly.
warn "Download and print all data? [no] \n";
{
    my $answer = <STDIN>;    # undef when STDIN is closed or at EOF
    exit unless defined $answer && $answer =~ /\A\s*y(?:es?)?\s*\z/i;
}
# take the next history entry from the factory; stop if there is none
my $hist = $factory->next_History;
die 'No history data returned' unless $hist;

# reuse the same factory for EFetch; -db is not set again, so the
# database chosen for the search carries over
$factory->set_parameters(
    -eutil   => 'efetch',
    -rettype => $opt->rettype,
    -history => $hist,
);
# number of failed attempts so far
my $retry = 0;

# batch size per request, and the index of the first result to download
my $retmax   = $opt->retmax;
my $retstart = $opt->retstart;

# destination for the downloaded data: the named file, or standard output
# when the option still holds its 'stdout' default
my $out;
if ($opt->output ne 'stdout') {
    open($out, '>', $opt->output) || die "Can't open file: $!";
} else {
    $out = \*STDOUT;
}
# Download the results in batches of $retmax, starting at index $retstart,
# and write each batch to $out. A failed request is retried up to
# $opt->retries times; the script dies once that limit is reached.

# a batch size below 1 would leave $retstart unchanged and loop forever
die "retmax must be a positive integer, got '$retmax'\n" if $retmax < 1;

RETRIEVE_RESULTS:
while ($retstart < $count) {
    $factory->set_parameters(
        -retmax   => $retmax,
        -retstart => $retstart,
    );

    # Collect the batch in memory and write it only after the request has
    # succeeded. If a request fails after the callback has already received
    # some data, printing directly would leave that partial data in the
    # output and the retry would then write it a second time.
    my $batch   = '';
    my $fetched = eval {
        $factory->get_Response(
            -cb => sub {
                my ($data) = @_;
                $batch .= $data;
            }
        );
        1;    # reached only when get_Response did not die
    };

    if (!$fetched) {
        my $error = $@ || 'unknown error';
        if ($retry >= $opt->retries) {
            # give up once the configured number of retries is used up
            die "Server error: $error. Try again later";
        }
        # otherwise, repeat the same batch without advancing $retstart
        warn "Server error, retry #", ++$retry, "\n";
        redo RETRIEVE_RESULTS;
    }

    print {$out} $batch or die "Can't write results: $!";

    # the retry limit applies to each batch, not to the whole download
    $retry = 0;

    # report the range actually covered; the final batch may be short
    my $last = $retstart + $retmax;
    $last = $count if $last > $count;
    warn "Retrieved $retstart-$last\n";

    $retstart += $retmax;
}

# buffered write errors only surface when the handle is closed
close $out or die "Can't close output: $!";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment