Last active
February 13, 2017 09:06
-
-
Save nylander/de0fe4f009314c1272523eb11a179910 to your computer and use it in GitHub Desktop.
Pitfall in get_taxonids(), Bio::DB::Taxonomy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
#=============================================================================== | |
=pod | |
=head1 | |
FILE: cgt.pl | |
USAGE: ./cgt.pl | |
DESCRIPTION: The intention of the original code was to search genbank | |
taxonomy for exact (case insensitive) match | |
of supplied search string (representing a scientific | |
name), and return a Taxon ID ("uid"). | |
BUGS: get_taxonids() returns an incorrect uid! | |
It returns a taxid from a match on the 'species' (second | |
word in search string) only and not the full binomen. | |
This happens if there is no match on the initial, two- | |
word string. | |
This seems to be a highly undesireable, and unexpected, | |
behaviour. | |
The workaround seems to be to attach a '[', or | |
'[Scientific Name]' to the search string. | |
This is, as far as I can tell, undocumented, and the use | |
without the extra string will, apparently, return undesired | |
("wrong"!) values. | |
NOTES: Example with six taxa: | |
String | What I expect | What I get | |
---------------------- | ------------- | ---------- | |
'Lissotriton vulgaris' | 8324 | 8324 | |
'Chlorella vulgaris' | 3077 | 3077 | |
'Phygadeuon solidus' | 1763951 | 1763951 | |
'Ovatus' | 666060 | 666060 | |
'Phygadeuon ovatus' | "No hit" | 666060 | |
'Trimorus ovatus' | "No hit" | 666060 | |
The last two strings (binomen) does not occur on NCBI's taxonomy, | |
yet get_taxonid() gives me an uid: '666060'. This uid, | |
is the result of a match on the second part of the string | |
('ovatus'), but the returned uid is actually the uid for the | |
genus 'Ovatus'! | |
What I was expecting was the same behaviour when using, e.g., | |
https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+ovatus%5BScientific+Name%5D | |
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term=Phygadeuon+ovatus[Scientific+Name] | |
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term=Phygadeuon+solidus[Scientific+Name] | |
and not | |
https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+ovatus | |
https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+solidus | |
WORKAROUND: | |
https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+ovatus[ | |
AUTHOR: Johan A. A. Nylander (JN) | |
COMPANY: NBIS/NRM | |
VERSION: 1.0 | |
CREATED: 08/23/2011 09:41:52 AM CEST | |
REVISION: 02/01/2017 02:23:13 PM | |
=cut | |
#=============================================================================== | |
use strict; | |
use warnings; | |
use Bio::DB::Taxonomy; | |
use Data::Dumper; | |
use Try::Tiny; | |
my @strings = ( | |
'Lissotriton vulgaris', | |
'Chlorella vulgaris', | |
'Phygadeuon solidus', | |
'Phygadeuon solidus[', | |
'Ovatus', | |
'Phygadeuon ovatus', | |
'Phygadeuon ovatus[Scientific Name]', | |
'Phygadeuon ovatus[All Names]', | |
'Phygadeuon ovatus[]', | |
'Phygadeuon ovatus[', | |
); | |
foreach my $string (@strings) { | |
my $db = Bio::DB::Taxonomy->new(-source => 'entrez'); | |
my @taxonids; | |
#$string = $string . '['; # uncomment to apply the workaround | |
try { | |
(@taxonids) = $db->get_taxonids($string); | |
} | |
catch { | |
warn "caught error: $_"; | |
}; | |
if (@taxonids) { | |
print Dumper(@taxonids); | |
} | |
else { | |
warn "No taxid found for: \'$string\'\n"; | |
} | |
sleep 1; | |
} | |
__END__ | |
BioPerl v.1.6.924-5 installed using apt on ubuntu 16.10 | |
Code from /usr/share/perl5/Bio/DB/Taxonomy/entrez.pm | |
=head2 get_taxonids | |
Title : get_taxonids | |
Usage : my $taxonid = $db->get_taxonids('Homo sapiens'); | |
Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query | |
string. Note that multiple taxonids can match to the same supplied | |
name. | |
Returns : array of integer ids in list context, one of these in scalar context | |
Args : string representing taxon's name | |
=cut | |
sub get_taxonids { | |
my ($self,$query) = @_; | |
my %p = $self->entrez_params; | |
# queries don't work correctly with special characters, so get rid of them. | |
if ($query =~ /<.+>/) { | |
# queries with <something> will fail, so workaround by removing, doing | |
# the query, getting multiple taxonids, then picking the one id that | |
# has a parent node with a scientific_name() or common_names() | |
# case-insensitive matching to the word(s) within <> | |
$query =~ s/ <(.+?)>//; ## 02/01/2017 02:32:15 PM JN: Why the leading blank space before the first '<'? | |
## My guess is that this regexp will fail | |
my $desired_parent_name = lc($1); | |
ID: foreach my $start_id ($self->get_taxonids($query)) { | |
my $node = $self->get_taxon($start_id) || next ID; | |
# walk up the parents until we hit a node with a named rank | |
while (1) { | |
my $parent_node = $self->ancestor($node) || next ID; | |
my $parent_sci_name = $parent_node->scientific_name || next ID; | |
my @parent_common_names = $parent_node->common_names; | |
unless (@parent_common_names) { | |
# ensure we're not using a minimal-info cached version | |
$parent_node = $self->get_taxon(-taxonid => $parent_node->id, -full => 1); | |
@parent_common_names = $parent_node->common_names; | |
} | |
foreach my $name ($parent_sci_name, @parent_common_names) { | |
if (lc($name) eq $desired_parent_name) { | |
return wantarray() ? ($start_id) : $start_id; | |
} | |
} | |
my $parent_rank = $parent_node->rank || 'no rank'; | |
if ($parent_rank ne 'no rank') { | |
last; | |
} | |
else { | |
$node = $parent_node; | |
} | |
} | |
} | |
return; | |
} | |
$query =~ s/[\"\(\)]//g; # not an exhaustive list; these are just the ones I know cause problems | |
$query =~ s/\s/+/g; | |
my @data; | |
if (defined $DATA_CACHE->{name_to_id}->{$query}) { | |
@data = @{$DATA_CACHE->{name_to_id}->{$query}}; | |
} | |
else { | |
$p{'term'} = $query; | |
my $twig = $self->_run_query($self->_build_url($EntrezGet, \%p)); | |
my $root = $twig->root; | |
my $list = $root->first_child('IdList'); | |
@data = map { $_->text } $list->children('Id'); | |
$DATA_CACHE->{name_to_id}->{$query} = [@data]; | |
} | |
return wantarray() ? @data : shift @data; | |
} | |
*get_taxonid = \&get_taxonids; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment