-
-
Save andrewyatz/831981 to your computer and use it in GitHub Desktop.
#!/bin/env perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use Pod::Usage; | |
use Bio::EnsEMBL::Registry; | |
use Bio::EnsEMBL::Utils::Scalar qw(wrap_array); | |
use IO::String; | |
my $O = options(); | |
my $DBA = database(); | |
my $MLSS = mlss(); | |
my $SYNTENY = synteny(); | |
my $FH = fh(); | |
print_synteny(); | |
if(!$O->{file}) { | |
print ${$FH->string_ref}; | |
} | |
close($FH); | |
exit 0; | |
sub options { | |
my $opts = {}; | |
my @flags = qw( | |
ensembl ensemblgenomes database=s species=s@ version=i file=s help man | |
); | |
GetOptions($opts, @flags) or pod2usage(1); | |
_exit( undef, 0, 1) if $opts->{help}; | |
_exit( undef, 0, 2) if $opts->{man}; | |
if($opts->{ensemblgenomes} && $opts->{ensembl}) { | |
_exit( 'Can only specify -ensembl or -ensemblgenomes', 1, 2); | |
} | |
my $species = wrap_array($opts->{species}); | |
if(scalar(@{$species}) != 2) { | |
_exit( 'Script requires two -species attributes', 1, 2); | |
} | |
$opts->{database} = 'multi' unless $opts->{database}; | |
return $opts; | |
} | |
sub database { | |
my %args; | |
if($O->{ensemblgenomes}) { | |
%args = ( | |
-HOST => 'mysql.ebi.ac.uk', -PORT => 4157, -USER => 'anonymous' | |
); | |
} | |
elsif($O->{ensembl}) { | |
%args = ( | |
-HOST => 'ensembldb.ensembl.org', -PORT => 5306, -USER => 'anonymous' | |
); | |
} | |
else { | |
_exit('Not aware of which server I should connect to. Specify -ensembl or -ensemblgenomes', 2, 1); | |
} | |
if($O->{version}) { | |
$args{-DB_VERSION} = $O->{version}; | |
} | |
Bio::EnsEMBL::Registry->load_registry_from_db(%args); | |
my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($O->{database}, 'compara'); | |
if(! defined $dba) { | |
_exit($O->{database}.' did not result in a valid DB. Try again specifying a different -database parameter. Also check your -version switch and API version', 2, 1); | |
} | |
return $dba; | |
} | |
sub mlss { | |
my $species = $O->{species}; | |
my $gdba = $DBA->get_GenomeDBAdaptor(); | |
my $mlssa = $DBA->get_MethodLinkSpeciesSetAdaptor(); | |
my @gdbs = map { $gdba->fetch_by_registry_name($_) } @{$species}; | |
my $mlss = $mlssa->fetch_by_method_link_type_GenomeDBs('SYNTENY', \@gdbs); | |
return $mlss; | |
} | |
sub synteny { | |
my $sra = $DBA->get_SyntenyRegionAdaptor(); | |
my $regions = $sra->fetch_all_by_MethodLinkSpeciesSet($MLSS); | |
return $regions; | |
} | |
sub fh { | |
if($O->{file}) { | |
return IO::File->new($O->{file}, 'w'); | |
} | |
return IO::String->new(); | |
} | |
sub print_synteny { | |
my ($source_gdb, $target_gdb) = @{$MLSS->species_set()->genome_dbs()}; | |
my $sname = $source_gdb->name(); | |
my $tname = $target_gdb->name(); | |
print $FH sprintf( | |
'track name=synteny_%1$s description="%1$s %2$s synteny projections"', | |
$sname, $tname); | |
print $FH "\n"; | |
foreach my $s (@{$SYNTENY}) { | |
my $regions = $s->get_all_DnaFragRegions(); | |
my ($source) = grep { $_->genome_db()->dbID() eq $source_gdb->dbID() } @{$regions}; | |
my ($target) = grep { $_->genome_db()->dbID() eq $target_gdb->dbID() } @{$regions}; | |
my $create_elements = sub { | |
my ($frag) = @_; | |
return ( | |
$frag->dnafrag()->name(), #name of region | |
$frag->dnafrag()->length(), #length of region e.g. chr1 in human | |
$frag->dnafrag_start(), #start of alignment | |
$frag->dnafrag_end() #end of alignment | |
); | |
}; | |
my @data = ( | |
$source->length(), | |
0, #misMatches | |
0, #repMatches | |
0, #nCount | |
0, #qNumInsert | |
0, #qBaseInsert | |
0, #tNumInsert | |
0, #tBaseInsert | |
( $source->dnafrag_strand() ) ? '+' : '-', #strand, | |
$create_elements->($source), #query info | |
$create_elements->($target), #target info | |
1, # number of blocks | |
$source->length(), #block sizes | |
$source->dnafrag_start(), #query start blocks | |
$target->dnafrag_start() #target start blocks | |
); | |
print $FH join(q{ }, @data), "\n"; | |
} | |
return; | |
} | |
sub _exit { | |
my ($msg, $status, $verbose) = @_; | |
print STDERR $msg, "\n" if defined $msg; | |
pod2usage( -exitstatus => $status, -verbose => $verbose ); | |
} | |
__END__ | |
=pod | |
=head1 NAME | |
list_synteny.pl | |
=head1 SYNOPSIS | |
./list_synteny.pl -species human -species mouse -ensembl -database multi [see opts] [ --help | --man ] | |
=head1 DESCRIPTION | |
Returns a PSL to screen or to a specified file location | |
=head1 OPTIONS | |
=over 8 | |
=item B<--species> | |
Accepts 2 invocations to specify the species to look for synteny between | |
=item B<--ensembl> | |
Specify to connect to the public ensembl database | |
=item B<--ensemblgenomes> | |
Specify to connect to the public ensembl genomes database | |
=item B<--database> | |
Specify the name of the database to connect to. If going to ensembl | |
use multi otherwise if connecting to ensembl genomes use one like | |
plants, metazoa, protists, bacteria or fungi (not all DBs have synteny | |
projections though). | |
=item B<--version> | |
Optional integer which can be used to force the databases we load. Useful if | |
your API does not match the DB you are connecting to. | |
=item B<--file> | |
Location of the file to write to. If not specified will write to STDOUT | |
=back | |
=head1 VERSION | |
1 | |
=head1 LICENSE | |
Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
Genome Research Limited. All rights reserved. | |
This software is distributed under a modified Apache license. | |
For license details, please see | |
http://www.ensembl.org/info/about/code_licence.html | |
=cut |
Unfortunately, it doesn't work for me. I get an error 'Not an ARRAY reference at list_synteny.pl line 105.'. When I change the array reference to hash (changing @ to %), I get further errors 'Can't locate object method "name" via package "adaptor" (perhaps you forgot to load "adaptor"'
I'm also aware that there are getsynteny.pl script under this link https://github.com/Ensembl/ensembl-compara/tree/release/107/scripts/synteny, but I'm struggling on how to use it correctly as well.
Do you mind helping me out with those? I really need this information for my undergrad thesis.
I've just updated this gist. The error was because I believe lots of years ago ->species_set()
would return an array, where as now it returns an object. Changing line 105's dereferencing to @{$MLSS->species_set()->genome_dbs()}
has fixed the issue. Verified by trying to extract human/mouse synteny.
Thank you so much for doing this!!
Last question: What version of perl are you using? I still get an issue in the line 109 saying "Can't use an undefined value as a symbol reference at list_synteny.pl"
Mine is following: This is perl 5, version 32, subversion 0 (v5.32.0) built for darwin-2level
I'm running This is perl 5, version 34, subversion 1 (v5.34.1) built for darwin-2level
and do not see any issue on my side sorry. The command I ran was: perl list_synteny.pl -species human -species mouse -ensembl
.
With respect to the error, this happens when a filehandle becomes undefined. You can see from the code that we initialise $FH
on line 16 before trying to print to it in print_synteny
. Are you working with a clean version of the script or a modified version? This certainly should work. Also try updating IO::String
and IO::File
modules in-case there is an issue with them
Okay, so I had to specify use IO::File
and get rid of line 23 close($FH);
, and now it works!! Thanks a lot, Andrew!
From a quick look I can only say is possibly. Still looks sensible. Please give it a try