Created
October 5, 2009 23:33
-
-
Save rbuels/202596 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package CXGN::TomatoGenome::GenbankImport; | |
| use strict; | |
| use warnings; | |
| use English; | |
| use Carp; | |
| =head1 NAME | |
| CXGN::TomatoGenome::GenbankImport | |
| =head1 SYNOPSIS | |
| =head1 DESCRIPTION | |
| =cut | |
| use Bio::DB::GenBank; | |
| use CXGN::Genomic::CloneIdentifiers; #< do not import anything, | |
| #namespace::autoclean is not | |
| #available yet for deployment | |
| use Data::Dumper; | |
| use Params::Validate (); | |
| use List::MoreUtils qw/ any /; | |
| use base qw/Class::Data::Inheritable/; | |
| __PACKAGE__->mk_classdata('verbose'); | |
| my $d = CXGN::Debug->new; | |
| sub vsay { | |
| my $class = shift; | |
| print @_,"\n" if $class->verbose || $d->get_debug; | |
| } | |
| =head1 CLASS METHODS | |
| =cut | |
| =head2 import_from_genbank | |
| Status : public | |
| Usage : CXGN::TomatoGenome::GenbankImport | |
| ->import_from_genbank_text_query( $chado, | |
| 'RHPOTKEY or POTGEN' ); | |
| Returns : nothing meaningful | |
| Args : hash-style list as: | |
| chado => Bio::Chado::Schema object, | |
| query => genbank query text (e.g. 'RHPOTKEY or POTGEN or RH89-039-16'), | |
| project_name => either project name string (e.g. 'acgt-roe') or a | |
| subroutine ref that figures out the project name when | |
| passed a Bio::Seq::RichSeq object | |
| project_country => (optional) string (e.g. 'US') or coderef, as above, | |
| defaults to builtin project country inference routine, | |
| clone_obj => string or subroutine ref to find the proper Clone object | |
| from a given Bio::Seq::RichSeq from GenBank, | |
| verbose => 0 or 1, default off. if true, prints information to stdout | |
| about what is being loaded. | |
| Side Eff: dies on error, and does not actually commit the loading | |
| transaction if CXGN_DEBUG is set true. | |
| Takes a L<Bio::Chado::Schema> object, and a text genbank query | |
| string, and loads the resulting records it finds into the given | |
| Chado database. | |
| =cut | |
| sub import_from_genbank { | |
| my $class = shift; | |
| my %a = Params::Validate::validate( | |
| @_, | |
| { | |
| chado => { | |
| can => [qw[resultset storage txn_do txn_rollback]], | |
| }, | |
| query => { | |
| type => Params::Validate::SCALAR, | |
| }, | |
| project_country => { | |
| default => \&_infer_project_country_from_gb_richseq, | |
| type => Params::Validate::CODEREF | Params::Validate::SCALAR, | |
| }, | |
| project_name => { | |
| type => Params::Validate::CODEREF | Params::Validate::SCALAR, | |
| }, | |
| clone_object => { | |
| default => \&_infer_clone_obj_from_gb_richseq, | |
| type => Params::Validate::CODEREF | Params::Validate::SCALAR, | |
| }, | |
| verbose => 0, | |
| }, | |
| ); | |
| $class->verbose( $a{verbose} || $d->get_debug); | |
| my $chado = delete $a{chado}; | |
| my %handlers; | |
| foreach my $k ( 'project_name', | |
| 'project_country', | |
| 'clone_object' | |
| ) { | |
| my $a = $a{$k}; | |
| if( ref($a) eq 'CODE' ) { | |
| $handlers{$k} = $a; | |
| } else { | |
| $handlers{$k} = sub { $a } | |
| } | |
| } | |
| # set up two genbank handles, one that gets records with no seqs, and | |
| # one that gets just fasta seqs | |
| my $gb_recs = Bio::DB::GenBank->new(-retrievaltype => 'tempfile' , | |
| #-format => 'Fasta', | |
| #don't download any of the sequences | |
| -seq_start => 1, -seq_stop => 1, | |
| ); | |
| my $gb_fasta = Bio::DB::GenBank->new( -retrievaltype => 'tempfile', | |
| -format => 'Fasta', | |
| ); | |
| $class->vsay("querying GenBank for clone records matching '$a{query}'..."); | |
| my $bac_recs = $gb_recs->get_Stream_by_query( Bio::DB::Query::GenBank->new | |
| ( -db => 'nucleotide', | |
| #-query => 'AC236750', | |
| -query => $a{query}, | |
| ) | |
| ); | |
| my $count = 0; | |
| while ( my $seq = $bac_recs->next_seq ) { | |
| sleep 2; #< wait a couple of seconds between genbank queries | |
| # find the record for the BAC associated with this seq | |
| my $clone = $handlers{clone_object}->( $seq ); | |
| unless( $clone ) { | |
| warn "cannot find clone record for ".$seq->accession_number.", skipping.\n"; | |
| die Dumper $seq; | |
| next; | |
| } | |
| ref($clone) && $clone->can('genbank_accession') | |
| or die "invalid clone '$clone'"; | |
| my $project_name = $handlers{project_name}->( $seq ); | |
| my $project_country = $handlers{project_country}->( $seq ); | |
| my $htgs_phase = _infer_htgs_phase_from_gb_richseq( $seq ); | |
| my $upstream_accession = $seq->accession_number; | |
| my $upstream_version = $seq->version; | |
| # find most recent accession for this BAC | |
| if ( my $current_accession = $clone->genbank_accession ) { | |
| # compare to this one. if up to date, next. | |
| my ($our_version) = $current_accession =~ /\.(\d+)$/ | |
| or die "error parsing genbank accession $current_accession"; | |
| if ( $upstream_version > $our_version ) { | |
| $class->vsay( $clone->clone_name.": current stored version $current_accession, loading new genbank seq version $upstream_accession.$upstream_version" ); | |
| } else { | |
| $class->vsay( $clone->clone_name.": current stored version $current_accession is >= upstream version $upstream_accession.$upstream_version, skipping" ); | |
| next; | |
| } | |
| } else { | |
| $class->vsay( $clone->clone_name.": no current sequence, loading upstream $upstream_accession.$upstream_version" ); | |
| } | |
| # fetch the full genbank seq for this one | |
| my $gb_seq = $gb_fasta->get_Seq_by_gi( $seq->primary_id ) | |
| or die 'could not fetch GI '.$seq->primary_id.":\n".Data::Dumper::Dumper $seq; | |
| unless( $gb_seq->length > 5_000 && $gb_seq->seq !~ /[<>]/ ) { | |
| warn "failed to fetch seq for ".$seq->display_id." (gi ".$seq->primary_id."), skipping.\n"; | |
| next; | |
| } | |
| # figure out the new sol-style sequence version for this bac | |
| my $parsed_ident = CXGN::Genomic::CloneIdentifiers::parse_clone_ident($clone->clone_name) | |
| or die "could not parse clone identifier ".$clone->clone_name; | |
| my $new_sol_seq_name = CXGN::Genomic::CloneIdentifiers::assemble_clone_ident | |
| ( versioned_bac_seq_no_chrom => | |
| { | |
| %$parsed_ident, | |
| version => ($clone->latest_sequence_version || 0) + 1, | |
| } | |
| ); | |
| # load it into the DB with the proper accession | |
| # and update the public.clone_feature table | |
| $chado->txn_do( sub { | |
| my $potato = _infer_organism($chado,$seq); | |
| my $bac_cvterm = bac_cvterm($chado); | |
| my $gbacc = $seq->accession_number.'.'.$seq->version; | |
| my $gi = $seq->primary_id; | |
| # make a feature for it in the feature table | |
| my $new_feature = | |
| $potato->create_related('features', | |
| { name => $new_sol_seq_name, | |
| uniquename => $new_sol_seq_name, | |
| type_id => $bac_cvterm->cvterm_id, | |
| residues => $gb_seq->seq, | |
| seqlen => $gb_seq->length, | |
| }, | |
| ); | |
| $new_feature->create_featureprops({ htgs_phase => $htgs_phase, | |
| finished_seq => ($htgs_phase == 3 ? 1 : 0), | |
| sequenced_by => $project_name, | |
| project_country => $project_country, | |
| description => "genbank_gi:$gi genbank_accession:$gbacc sequenced_by:$project_name project_country:$project_country htgs_phase:$htgs_phase", | |
| }, | |
| { | |
| autocreate => 1 }, | |
| ); | |
| # add a dbxref for its genbank accession | |
| my $gbacc_dbx = $chado->resultset('General::Db') | |
| ->find_or_create({ name => 'DB:GenBank_Accession'}) | |
| ->find_or_create_related('dbxrefs', | |
| { accession => $gbacc, | |
| version => $seq->version, | |
| } | |
| ); | |
| $new_feature->add_to_secondary_dbxrefs( $gbacc_dbx ); | |
| # add a dbxref for its genbank GI | |
| my $gi_dbx = $chado->resultset('General::Db') | |
| ->find_or_create({ name => 'DB:GenBank_GI'}) | |
| ->find_or_create_related('dbxrefs', | |
| { accession => $gi, | |
| } | |
| ); | |
| $new_feature->add_to_secondary_dbxrefs( $gi_dbx ); | |
| # manually update the clone_feature table | |
| $chado->storage->dbh_do(sub { | |
| my ($s,$dbh) = @_; | |
| $dbh->do('delete from public.clone_feature where clone_id = ?', | |
| undef, | |
| $clone->clone_id, | |
| ); | |
| $dbh->do('insert into public.clone_feature (feature_id,clone_id) values (?,?)', | |
| undef, | |
| $new_feature->feature_id, | |
| $clone->clone_id, | |
| ); | |
| }); | |
| # don't actually do anything to the db if debug is on | |
| $chado->storage->txn_rollback if $d->get_debug; | |
| }); | |
| #print Dumper $seq; | |
| #die unless $clone_name; | |
| # print join "\t", $seq->id, $clone_name || 'UNKNOWN', $seq->accession_number, $seq->version; | |
| # print "\n"; | |
| $count++; | |
| # for each seq, we need to update: | |
| # - mapping between gb accession and bac name | |
| # - bac sequence | |
| } | |
| $class->vsay( "loaded $count seqs\n" ); | |
| } | |
| sub _infer_project_country_from_gb_richseq { | |
| my $seq = shift; | |
| my @project_signatures = @_; | |
| unless( @project_signatures ) { | |
| @project_signatures = | |
| ( | |
| [ IE => qr/Ireland/i ], | |
| [ CN => qr/China/, qr/Qu,D/, qr/Du,Y/, qr/He,J/, |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment