Created
August 8, 2013 08:31
-
-
Save dbolser/6182743 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
$| = 1; | |
use strict; | |
use warnings; | |
use autodie; | |
use feature 'say'; | |
use Class::Load 'load_class'; | |
use Cwd 'cwd'; | |
use Data::Dump 'dump'; | |
use File::Spec::Functions; | |
use File::Path 'mkpath'; | |
use Getopt::Long; | |
use Grm::Config; | |
use Grm::Utils qw(commify timer_calc); | |
use Pod::Usage; | |
use Readonly; | |
my $list_species = 0; | |
my $out_dir = cwd(); | |
my $selected_species = ''; | |
my ( $help, $man_page ); | |
GetOptions( | |
'l|list' => \$list_species, | |
's|species:s' => \$selected_species, | |
'o|out:s' => \$out_dir, | |
'help' => \$help, | |
'man' => \$man_page, | |
) or pod2usage(2); | |
if ( $help || $man_page ) { | |
pod2usage({ | |
-exitval => 0, | |
-verbose => $man_page ? 2 : 1 | |
}); | |
}; | |
my $timer = timer_calc(); | |
my $conf = Grm::Config->new; | |
my $ens_conf = $conf->get('ensembl'); | |
my $reg_file = $ens_conf->{'registry_file'} or die 'No registry file'; | |
my $install_dir = $ens_conf->{'install_dir'} or die 'No install dir'; | |
if ( $reg_file && $install_dir && -e $reg_file && -d $install_dir ) { | |
my %inc = map { $_, 1 } @INC; | |
for my $dir ( qw{ | |
conf | |
ensembl-compara/modules | |
ensembl-draw/modules | |
ensembl-external/modules | |
ensembl-variation/modules | |
ensembl/modules | |
modules | |
} ) { | |
my $path = catdir( $install_dir, $dir ); | |
if ( !$inc{ $path } ) { | |
push @INC, $path; | |
} | |
} | |
} | |
$out_dir = canonpath( $out_dir ); | |
if ( !-d $out_dir ) { | |
mkpath( $out_dir ); | |
} | |
my $reg_class = 'Bio::EnsEMBL::Registry'; | |
load_class( $reg_class ) or die "Cannot load '$reg_class'"; | |
$reg_class->load_all( $reg_file ); | |
my %valid_db_adapators = | |
map { lc $_->species, $_ } | |
grep { $_->group eq 'core'} | |
@{ $reg_class->get_all_DBAdaptors() } | |
; | |
if ( $list_species ) { | |
printf "Valid species:\n%s\n", | |
join( "\n", map { " - $_" } sort keys %valid_db_adapators ), | |
; | |
exit 0; | |
} | |
my %selected_species = | |
map { s/ /_/g; lc $_, 1 } | |
split(/\s*,\s*/, $selected_species); | |
if ( %selected_species ) { | |
if ( | |
my @bad = grep { !$valid_db_adapators{ $_ } } keys %selected_species | |
) { | |
printf "Bad species:\n%s\n", join( "\n", map { " - $_" } @bad ); | |
exit 0; | |
} | |
} | |
elsif ( defined $selected_species{'all'} || !%selected_species ) { | |
%selected_species = %valid_db_adapators; | |
} | |
printf "Exporting %s species to '%s'\n", | |
commify(scalar keys %selected_species), $out_dir; | |
my @out_flds = qw[ | |
ensembl_id | |
ensembl_object_type | |
db_display_name | |
linkage_annotation | |
dbname | |
release | |
info_type | |
info_text | |
type | |
version | |
secondary_db_name | |
secondary_db_table | |
description | |
]; | |
my $num_species = 0; | |
for my $species ( sort keys %valid_db_adapators ) { | |
my $species_timer = timer_calc(); | |
my $db = $valid_db_adapators{ $species }; | |
if ( %selected_species ) { | |
next unless defined $selected_species{ lc $species }; | |
} | |
my $gene_adaptor = $db->get_GeneAdaptor(); | |
my @genes = @{ $gene_adaptor->fetch_all }; | |
my $total_num_genes = scalar @genes; | |
printf "%3s: Processing %s gene%s from %s\n", | |
++$num_species, | |
commify($total_num_genes), | |
$total_num_genes == 1 ? '' : 's', | |
$species, | |
; | |
my $out = catfile( $out_dir, "${species}-xrefs.tab" ); | |
open my $fh, '>', $out; | |
print $fh join( "\t", @out_flds, 'synonyms' ), "\n"; | |
my ( $num_gene, $num_xref ) = ( 0, 0 ); | |
for my $gene ( @genes ) { | |
my @xrefs = @{ $gene->get_all_object_xrefs }; | |
$num_gene++; | |
printf "%-78s\r", sprintf('%11s (%3s%%): %s has %s xrefs', | |
commify($num_gene), | |
int(($num_gene/$total_num_genes) * 100), | |
$gene->display_id, | |
commify(scalar @xrefs) | |
); | |
for my $xref ( @xrefs ) { | |
print $fh join( "\t", | |
( map { $xref->$_() // '' } @out_flds ), | |
join(', ', @{ $xref->get_all_synonyms }), | |
), "\n"; | |
$num_xref++; | |
} | |
} | |
close $fh; | |
printf "\r => Exported %s xrefs for %s genes in %s\n", | |
commify($num_xref), | |
commify($num_gene), | |
$species_timer->(), | |
; | |
} | |
printf "Exported %s species in %s to %s\n", | |
commify($num_species), | |
$timer->(), | |
$out_dir, | |
; | |
__END__ | |
# ---------------------------------------------------- | |
=pod | |
=head1 NAME | |
extract-ensembl-xrefs.pl - exports Ensembl xrefs | |
=head1 SYNOPSIS | |
extract-ensembl-xrefs.pl | |
Options: | |
-l|--list Show a list of valid species | |
-s|--species Comma-separated list of species, default is all | |
-o|--out Directory to place output files, default is '.' | |
--help Show brief help and exit | |
--man Show full documentation | |
=head1 DESCRIPTION | |
Describe what the script does, what input it expects, what output it | |
creates, etc. | |
=head1 AUTHOR | |
Ken Youens-Clark E<lt>[email protected]<gt>. | |
=head1 COPYRIGHT | |
Copyright (c) 2013 Cold Spring Harbor Laboratory | |
This module is free software; you can redistribute it and/or | |
modify it under the terms of the GPL (either version 1, or at | |
your option, any later version) or the Artistic License 2.0. | |
Refer to LICENSE for the full license text and to DISCLAIMER for | |
additional warranty disclaimers. | |
=cut |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This code boils down to running 'Bio::EnsEMBL::Gene->get_all_object_xrefs' [1] on each gene for each species.
[1] http://www.ensembl.org/info/docs/Doxygen/core-api/classBio_1_1EnsEMBL_1_1Gene.html#a8d9c50953e4b8125f641fe691b83c160