nylander · February 13, 2017 09:06
diff --git a/gistfile1.txt b/gistfile1.txt
 #!/usr/bin/perl 
 #===============================================================================
 =pod

 =head1
         FILE:  cgt.pl

        USAGE:  ./cgt.pl

  DESCRIPTION:  The intention of the original code was to search genbank
                taxonomy for exact (case insensitive) match
                of supplied search string (representing a scientific
                name), and return a Taxon ID ("uid").

         BUGS:  get_taxonids() returns an incorrect uid!
                It returns a taxid from a match on the 'species' (second
                word in search string) only and not the full binomen.
                This happens if there is no match on the initial, two-
                word string.
                This seems to be a highly undesireable, and unexpected,
                behaviour.

                The workaround seems to be to attach a '[', or 
                '[Scientific Name]' to the search string.
                This is, as far as I can tell, undocumented, and the use
                without the extra string will, apparently, return undesired
                ("wrong"!) values.


        NOTES:  Example with six taxa:

                String                 | What I expect | What I get
                ---------------------- | ------------- | ----------
                'Lissotriton vulgaris' | 8324          | 8324      
                'Chlorella vulgaris'   | 3077          | 3077      
                'Phygadeuon solidus'   | 1763951       | 1763951   
                'Ovatus'               | 666060        | 666060    
                'Phygadeuon ovatus'    | "No hit"      | 666060    
                'Trimorus ovatus'      | "No hit"      | 666060    

                The last two strings (binomen) does not occur on NCBI's taxonomy,
                yet get_taxonid() gives me an uid: '666060'. This uid,
                is the result of a match on the second part of the string
                ('ovatus'), but the returned uid is actually the uid for the
                genus 'Ovatus'!

                What I was expecting was the same behaviour when using, e.g.,

                https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+ovatus%5BScientific+Name%5D
                https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term=Phygadeuon+ovatus[Scientific+Name]
                https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term=Phygadeuon+solidus[Scientific+Name]

                and not
                https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+ovatus
                https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+solidus


                WORKAROUND:
                https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+ovatus[


       AUTHOR:  Johan A. A. Nylander (JN)
      COMPANY:  NBIS/NRM
      VERSION:  1.0
      CREATED:  08/23/2011 09:41:52 AM CEST
     REVISION:  02/01/2017 02:23:13 PM

 =cut
 #===============================================================================

 use strict;
 use warnings;
 use Bio::DB::Taxonomy;
 use Data::Dumper;
 use Try::Tiny;

 my @strings = (
    'Lissotriton vulgaris',
    'Chlorella vulgaris',
    'Phygadeuon solidus',
    'Phygadeuon solidus[',
    'Ovatus',
    'Phygadeuon ovatus',
    'Phygadeuon ovatus[Scientific Name]',
    'Phygadeuon ovatus[All Names]',
    'Phygadeuon ovatus[]',
    'Phygadeuon ovatus[',
 );

 foreach my $string (@strings) {
    my $db = Bio::DB::Taxonomy->new(-source => 'entrez');
    my @taxonids;
    #$string = $string . '['; # uncomment to apply the workaround
    try {
        (@taxonids) = $db->get_taxonids($string);
    }
    catch {
        warn "caught error: $_";
    };
    if (@taxonids) {
        print Dumper(@taxonids);
    }
    else {
        warn "No taxid found for: \'$string\'\n";
    }
    sleep 1;
 }

 __END__

 BioPerl v.1.6.924-5 installed using apt on ubuntu 16.10 
 Code from /usr/share/perl5/Bio/DB/Taxonomy/entrez.pm

 =head2 get_taxonids

 Title   : get_taxonids
 Usage   : my $taxonid = $db->get_taxonids('Homo sapiens');
 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
           string. Note that multiple taxonids can match to the same supplied
           name.
 Returns : array of integer ids in list context, one of these in scalar context
 Args    : string representing taxon's name

 =cut

 sub get_taxonids {
    my ($self,$query) = @_;
    my %p = $self->entrez_params;
    
    # queries don't work correctly with special characters, so get rid of them.
    if ($query =~ /<.+>/) {
        # queries with <something> will fail, so workaround by removing, doing
        # the query, getting multiple taxonids, then picking the one id that
        # has a parent node with a scientific_name() or common_names()
        # case-insensitive matching to the word(s) within <>
        $query =~ s/ <(.+?)>//; ## 02/01/2017 02:32:15 PM JN: Why the leading blank space before the first '<'?
                                ## My guess is that this regexp will fail
        my $desired_parent_name = lc($1);
        
        ID: foreach my $start_id ($self->get_taxonids($query)) {
            my $node = $self->get_taxon($start_id) || next ID;
            
            # walk up the parents until we hit a node with a named rank
            while (1) {
                my $parent_node = $self->ancestor($node) || next ID;
                my $parent_sci_name = $parent_node->scientific_name || next ID;
                my @parent_common_names = $parent_node->common_names;
                unless (@parent_common_names) {
                    # ensure we're not using a minimal-info cached version
                    $parent_node = $self->get_taxon(-taxonid => $parent_node->id, -full => 1);
                    @parent_common_names = $parent_node->common_names;
                }
                
                foreach my $name ($parent_sci_name, @parent_common_names) {
                    if (lc($name) eq $desired_parent_name) {
                        return wantarray() ? ($start_id) : $start_id;
                    }
                }
                
                my $parent_rank = $parent_node->rank || 'no rank';
                if ($parent_rank ne 'no rank') {
                    last;
                }
                else {
                    $node = $parent_node;
                }
            }
        }
        return;
    }
    $query =~ s/[\"\(\)]//g; # not an exhaustive list; these are just the ones I know cause problems
    $query =~ s/\s/+/g;
    
    my @data;
    if (defined $DATA_CACHE->{name_to_id}->{$query}) {
        @data = @{$DATA_CACHE->{name_to_id}->{$query}};
    }
    else {
        $p{'term'} = $query;

        my $twig = $self->_run_query($self->_build_url($EntrezGet, \%p));
        my $root = $twig->root;
        my $list = $root->first_child('IdList');
        @data = map { $_->text } $list->children('Id');
        
        $DATA_CACHE->{name_to_id}->{$query} = [@data];
    }
    
    return wantarray() ? @data : shift @data;
 }

 *get_taxonid = \&get_taxonids;
	#!/usr/bin/perl
	#===============================================================================
	=pod

	=head1
	FILE: cgt.pl

	USAGE: ./cgt.pl

	DESCRIPTION: The intention of the original code was to search genbank
	taxonomy for exact (case insensitive) match
	of supplied search string (representing a scientific
	name), and return a Taxon ID ("uid").

	BUGS: get_taxonids() returns an incorrect uid!
	It returns a taxid from a match on the 'species' (second
	word in search string) only and not the full binomen.
	This happens if there is no match on the initial, two-
	word string.
	This seems to be a highly undesireable, and unexpected,
	behaviour.

	The workaround seems to be to attach a '[', or
	'[Scientific Name]' to the search string.
	This is, as far as I can tell, undocumented, and the use
	without the extra string will, apparently, return undesired
	("wrong"!) values.


	NOTES: Example with six taxa:

	String \| What I expect \| What I get
	---------------------- \| ------------- \| ----------
	'Lissotriton vulgaris' \| 8324 \| 8324
	'Chlorella vulgaris' \| 3077 \| 3077
	'Phygadeuon solidus' \| 1763951 \| 1763951
	'Ovatus' \| 666060 \| 666060
	'Phygadeuon ovatus' \| "No hit" \| 666060
	'Trimorus ovatus' \| "No hit" \| 666060

	The last two strings (binomen) does not occur on NCBI's taxonomy,
	yet get_taxonid() gives me an uid: '666060'. This uid,
	is the result of a match on the second part of the string
	('ovatus'), but the returned uid is actually the uid for the
	genus 'Ovatus'!

	What I was expecting was the same behaviour when using, e.g.,

	https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+ovatus%5BScientific+Name%5D
	https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term=Phygadeuon+ovatus[Scientific+Name]
	https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term=Phygadeuon+solidus[Scientific+Name]

	and not
	https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+ovatus
	https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+solidus


	WORKAROUND:
	https://www.ncbi.nlm.nih.gov/taxonomy/?term=Phygadeuon+ovatus[


	AUTHOR: Johan A. A. Nylander (JN)
	COMPANY: NBIS/NRM
	VERSION: 1.0
	CREATED: 08/23/2011 09:41:52 AM CEST
	REVISION: 02/01/2017 02:23:13 PM

	=cut
	#===============================================================================

	use strict;
	use warnings;
	use Bio::DB::Taxonomy;
	use Data::Dumper;
	use Try::Tiny;

	my @strings = (
	'Lissotriton vulgaris',
	'Chlorella vulgaris',
	'Phygadeuon solidus',
	'Phygadeuon solidus[',
	'Ovatus',
	'Phygadeuon ovatus',
	'Phygadeuon ovatus[Scientific Name]',
	'Phygadeuon ovatus[All Names]',
	'Phygadeuon ovatus[]',
	'Phygadeuon ovatus[',
	);

	foreach my $string (@strings) {
	my $db = Bio::DB::Taxonomy->new(-source => 'entrez');
	my @taxonids;
	#$string = $string . '['; # uncomment to apply the workaround
	try {
	(@taxonids) = $db->get_taxonids($string);
	}
	catch {
	warn "caught error: $_";
	};
	if (@taxonids) {
	print Dumper(@taxonids);
	}
	else {
	warn "No taxid found for: \'$string\'\n";
	}
	sleep 1;
	}

	__END__

	BioPerl v.1.6.924-5 installed using apt on ubuntu 16.10
	Code from /usr/share/perl5/Bio/DB/Taxonomy/entrez.pm

	=head2 get_taxonids

	Title : get_taxonids
	Usage : my $taxonid = $db->get_taxonids('Homo sapiens');
	Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
	string. Note that multiple taxonids can match to the same supplied
	name.
	Returns : array of integer ids in list context, one of these in scalar context
	Args : string representing taxon's name

	=cut

	sub get_taxonids {
	my ($self,$query) = @_;
	my %p = $self->entrez_params;

	# queries don't work correctly with special characters, so get rid of them.
	if ($query =~ /<.+>/) {
	# queries with <something> will fail, so workaround by removing, doing
	# the query, getting multiple taxonids, then picking the one id that
	# has a parent node with a scientific_name() or common_names()
	# case-insensitive matching to the word(s) within <>
	$query =~ s/ <(.+?)>//; ## 02/01/2017 02:32:15 PM JN: Why the leading blank space before the first '<'?
	## My guess is that this regexp will fail
	my $desired_parent_name = lc($1);

	ID: foreach my $start_id ($self->get_taxonids($query)) {
	my $node = $self->get_taxon($start_id) \|\| next ID;

	# walk up the parents until we hit a node with a named rank
	while (1) {
	my $parent_node = $self->ancestor($node) \|\| next ID;
	my $parent_sci_name = $parent_node->scientific_name \|\| next ID;
	my @parent_common_names = $parent_node->common_names;
	unless (@parent_common_names) {
	# ensure we're not using a minimal-info cached version
	$parent_node = $self->get_taxon(-taxonid => $parent_node->id, -full => 1);
	@parent_common_names = $parent_node->common_names;
	}

	foreach my $name ($parent_sci_name, @parent_common_names) {
	if (lc($name) eq $desired_parent_name) {
	return wantarray() ? ($start_id) : $start_id;
	}
	}

	my $parent_rank = $parent_node->rank \|\| 'no rank';
	if ($parent_rank ne 'no rank') {
	last;
	}
	else {
	$node = $parent_node;
	}
	}
	}
	return;
	}
	$query =~ s/[\"\(\)]//g; # not an exhaustive list; these are just the ones I know cause problems
	$query =~ s/\s/+/g;

	my @data;
	if (defined $DATA_CACHE->{name_to_id}->{$query}) {
	@data = @{$DATA_CACHE->{name_to_id}->{$query}};
	}
	else {
	$p{'term'} = $query;

	my $twig = $self->_run_query($self->_build_url($EntrezGet, \%p));
	my $root = $twig->root;
	my $list = $root->first_child('IdList');
	@data = map { $_->text } $list->children('Id');

	$DATA_CACHE->{name_to_id}->{$query} = [@data];
	}

	return wantarray() ? @data : shift @data;
	}

	*get_taxonid = \&get_taxonids;