Skip to content

Instantly share code, notes, and snippets.

@rbuels
Created October 5, 2009 23:33
Show Gist options
  • Select an option

  • Save rbuels/202596 to your computer and use it in GitHub Desktop.

Select an option

Save rbuels/202596 to your computer and use it in GitHub Desktop.
package CXGN::TomatoGenome::GenbankImport;
use strict;
use warnings;
use English;
use Carp;
=head1 NAME
CXGN::TomatoGenome::GenbankImport
=head1 SYNOPSIS
=head1 DESCRIPTION
=cut
use Bio::DB::GenBank;
use CXGN::Genomic::CloneIdentifiers; #< do not import anything,
#namespace::autoclean is not
#available yet for deployment
use Data::Dumper;
use Params::Validate ();
use List::MoreUtils qw/ any /;
use base qw/Class::Data::Inheritable/;
__PACKAGE__->mk_classdata('verbose');
my $d = CXGN::Debug->new;
sub vsay {
my $class = shift;
print @_,"\n" if $class->verbose || $d->get_debug;
}
=head1 CLASS METHODS
=cut
=head2 import_from_genbank
Status : public
Usage : CXGN::TomatoGenome::GenbankImport
->import_from_genbank_text_query( $chado,
'RHPOTKEY or POTGEN' );
Returns : nothing meaningful
Args : hash-style list as:
chado => Bio::Chado::Schema object,
query => genbank query text (e.g. 'RHPOTKEY or POTGEN or RH89-039-16'),
project_name => either project name string (e.g. 'acgt-roe') or a
subroutine ref that figures out the project name when
passed a Bio::Seq::RichSeq object
project_country => (optional) string (e.g. 'US') or coderef, as above,
defaults to builtin project country inference routine,
clone_obj => string or subroutine ref to find the proper Clone object
from a given Bio::Seq::RichSeq from GenBank,
verbose => 0 or 1, default off. if true, prints information to stdout
about what is being loaded.
Side Eff: dies on error, and does not actually commit the loading
transaction if CXGN_DEBUG is set true.
Takes a L<Bio::Chado::Schema> object, and a text genbank query
string, and loads the resulting records it finds into the given
Chado database.
=cut
sub import_from_genbank {
my $class = shift;
my %a = Params::Validate::validate(
@_,
{
chado => {
can => [qw[resultset storage txn_do txn_rollback]],
},
query => {
type => Params::Validate::SCALAR,
},
project_country => {
default => \&_infer_project_country_from_gb_richseq,
type => Params::Validate::CODEREF | Params::Validate::SCALAR,
},
project_name => {
type => Params::Validate::CODEREF | Params::Validate::SCALAR,
},
clone_object => {
default => \&_infer_clone_obj_from_gb_richseq,
type => Params::Validate::CODEREF | Params::Validate::SCALAR,
},
verbose => 0,
},
);
$class->verbose( $a{verbose} || $d->get_debug);
my $chado = delete $a{chado};
my %handlers;
foreach my $k ( 'project_name',
'project_country',
'clone_object'
) {
my $a = $a{$k};
if( ref($a) eq 'CODE' ) {
$handlers{$k} = $a;
} else {
$handlers{$k} = sub { $a }
}
}
# set up two genbank handles, one that gets records with no seqs, and
# one that gets just fasta seqs
my $gb_recs = Bio::DB::GenBank->new(-retrievaltype => 'tempfile' ,
#-format => 'Fasta',
#don't download any of the sequences
-seq_start => 1, -seq_stop => 1,
);
my $gb_fasta = Bio::DB::GenBank->new( -retrievaltype => 'tempfile',
-format => 'Fasta',
);
$class->vsay("querying GenBank for clone records matching '$a{query}'...");
my $bac_recs = $gb_recs->get_Stream_by_query( Bio::DB::Query::GenBank->new
( -db => 'nucleotide',
#-query => 'AC236750',
-query => $a{query},
)
);
my $count = 0;
while ( my $seq = $bac_recs->next_seq ) {
sleep 2; #< wait a couple of seconds between genbank queries
# find the record for the BAC associated with this seq
my $clone = $handlers{clone_object}->( $seq );
unless( $clone ) {
warn "cannot find clone record for ".$seq->accession_number.", skipping.\n";
die Dumper $seq;
next;
}
ref($clone) && $clone->can('genbank_accession')
or die "invalid clone '$clone'";
my $project_name = $handlers{project_name}->( $seq );
my $project_country = $handlers{project_country}->( $seq );
my $htgs_phase = _infer_htgs_phase_from_gb_richseq( $seq );
my $upstream_accession = $seq->accession_number;
my $upstream_version = $seq->version;
# find most recent accession for this BAC
if ( my $current_accession = $clone->genbank_accession ) {
# compare to this one. if up to date, next.
my ($our_version) = $current_accession =~ /\.(\d+)$/
or die "error parsing genbank accession $current_accession";
if ( $upstream_version > $our_version ) {
$class->vsay( $clone->clone_name.": current stored version $current_accession, loading new genbank seq version $upstream_accession.$upstream_version" );
} else {
$class->vsay( $clone->clone_name.": current stored version $current_accession is >= upstream version $upstream_accession.$upstream_version, skipping" );
next;
}
} else {
$class->vsay( $clone->clone_name.": no current sequence, loading upstream $upstream_accession.$upstream_version" );
}
# fetch the full genbank seq for this one
my $gb_seq = $gb_fasta->get_Seq_by_gi( $seq->primary_id )
or die 'could not fetch GI '.$seq->primary_id.":\n".Data::Dumper::Dumper $seq;
unless( $gb_seq->length > 5_000 && $gb_seq->seq !~ /[<>]/ ) {
warn "failed to fetch seq for ".$seq->display_id." (gi ".$seq->primary_id."), skipping.\n";
next;
}
# figure out the new sol-style sequence version for this bac
my $parsed_ident = CXGN::Genomic::CloneIdentifiers::parse_clone_ident($clone->clone_name)
or die "could not parse clone identifier ".$clone->clone_name;
my $new_sol_seq_name = CXGN::Genomic::CloneIdentifiers::assemble_clone_ident
( versioned_bac_seq_no_chrom =>
{
%$parsed_ident,
version => ($clone->latest_sequence_version || 0) + 1,
}
);
# load it into the DB with the proper accession
# and update the public.clone_feature table
$chado->txn_do( sub {
my $potato = _infer_organism($chado,$seq);
my $bac_cvterm = bac_cvterm($chado);
my $gbacc = $seq->accession_number.'.'.$seq->version;
my $gi = $seq->primary_id;
# make a feature for it in the feature table
my $new_feature =
$potato->create_related('features',
{ name => $new_sol_seq_name,
uniquename => $new_sol_seq_name,
type_id => $bac_cvterm->cvterm_id,
residues => $gb_seq->seq,
seqlen => $gb_seq->length,
},
);
$new_feature->create_featureprops({ htgs_phase => $htgs_phase,
finished_seq => ($htgs_phase == 3 ? 1 : 0),
sequenced_by => $project_name,
project_country => $project_country,
description => "genbank_gi:$gi genbank_accession:$gbacc sequenced_by:$project_name project_country:$project_country htgs_phase:$htgs_phase",
},
{
autocreate => 1 },
);
# add a dbxref for its genbank accession
my $gbacc_dbx = $chado->resultset('General::Db')
->find_or_create({ name => 'DB:GenBank_Accession'})
->find_or_create_related('dbxrefs',
{ accession => $gbacc,
version => $seq->version,
}
);
$new_feature->add_to_secondary_dbxrefs( $gbacc_dbx );
# add a dbxref for its genbank GI
my $gi_dbx = $chado->resultset('General::Db')
->find_or_create({ name => 'DB:GenBank_GI'})
->find_or_create_related('dbxrefs',
{ accession => $gi,
}
);
$new_feature->add_to_secondary_dbxrefs( $gi_dbx );
# manually update the clone_feature table
$chado->storage->dbh_do(sub {
my ($s,$dbh) = @_;
$dbh->do('delete from public.clone_feature where clone_id = ?',
undef,
$clone->clone_id,
);
$dbh->do('insert into public.clone_feature (feature_id,clone_id) values (?,?)',
undef,
$new_feature->feature_id,
$clone->clone_id,
);
});
# don't actually do anything to the db if debug is on
$chado->storage->txn_rollback if $d->get_debug;
});
#print Dumper $seq;
#die unless $clone_name;
# print join "\t", $seq->id, $clone_name || 'UNKNOWN', $seq->accession_number, $seq->version;
# print "\n";
$count++;
# for each seq, we need to update:
# - mapping between gb accession and bac name
# - bac sequence
}
$class->vsay( "loaded $count seqs\n" );
}
sub _infer_project_country_from_gb_richseq {
my $seq = shift;
my @project_signatures = @_;
unless( @project_signatures ) {
@project_signatures =
(
[ IE => qr/Ireland/i ],
[ CN => qr/China/, qr/Qu,D/, qr/Du,Y/, qr/He,J/,
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment