Skip to content

Instantly share code, notes, and snippets.

@dbolser
Created August 8, 2013 08:31
Show Gist options
  • Save dbolser/6182743 to your computer and use it in GitHub Desktop.
Save dbolser/6182743 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
$| = 1;
use strict;
use warnings;
use autodie;
use feature 'say';
use Class::Load 'load_class';
use Cwd 'cwd';
use Data::Dump 'dump';
use File::Spec::Functions;
use File::Path 'mkpath';
use Getopt::Long;
use Grm::Config;
use Grm::Utils qw(commify timer_calc);
use Pod::Usage;
use Readonly;
my $list_species = 0;
my $out_dir = cwd();
my $selected_species = '';
my ( $help, $man_page );
GetOptions(
'l|list' => \$list_species,
's|species:s' => \$selected_species,
'o|out:s' => \$out_dir,
'help' => \$help,
'man' => \$man_page,
) or pod2usage(2);
if ( $help || $man_page ) {
pod2usage({
-exitval => 0,
-verbose => $man_page ? 2 : 1
});
};
my $timer = timer_calc();
my $conf = Grm::Config->new;
my $ens_conf = $conf->get('ensembl');
my $reg_file = $ens_conf->{'registry_file'} or die 'No registry file';
my $install_dir = $ens_conf->{'install_dir'} or die 'No install dir';
if ( $reg_file && $install_dir && -e $reg_file && -d $install_dir ) {
my %inc = map { $_, 1 } @INC;
for my $dir ( qw{
conf
ensembl-compara/modules
ensembl-draw/modules
ensembl-external/modules
ensembl-variation/modules
ensembl/modules
modules
} ) {
my $path = catdir( $install_dir, $dir );
if ( !$inc{ $path } ) {
push @INC, $path;
}
}
}
$out_dir = canonpath( $out_dir );
if ( !-d $out_dir ) {
mkpath( $out_dir );
}
my $reg_class = 'Bio::EnsEMBL::Registry';
load_class( $reg_class ) or die "Cannot load '$reg_class'";
$reg_class->load_all( $reg_file );
my %valid_db_adapators =
map { lc $_->species, $_ }
grep { $_->group eq 'core'}
@{ $reg_class->get_all_DBAdaptors() }
;
if ( $list_species ) {
printf "Valid species:\n%s\n",
join( "\n", map { " - $_" } sort keys %valid_db_adapators ),
;
exit 0;
}
my %selected_species =
map { s/ /_/g; lc $_, 1 }
split(/\s*,\s*/, $selected_species);
if ( %selected_species ) {
if (
my @bad = grep { !$valid_db_adapators{ $_ } } keys %selected_species
) {
printf "Bad species:\n%s\n", join( "\n", map { " - $_" } @bad );
exit 0;
}
}
elsif ( defined $selected_species{'all'} || !%selected_species ) {
%selected_species = %valid_db_adapators;
}
printf "Exporting %s species to '%s'\n",
commify(scalar keys %selected_species), $out_dir;
my @out_flds = qw[
ensembl_id
ensembl_object_type
db_display_name
linkage_annotation
dbname
release
info_type
info_text
type
version
secondary_db_name
secondary_db_table
description
];
my $num_species = 0;
for my $species ( sort keys %valid_db_adapators ) {
my $species_timer = timer_calc();
my $db = $valid_db_adapators{ $species };
if ( %selected_species ) {
next unless defined $selected_species{ lc $species };
}
my $gene_adaptor = $db->get_GeneAdaptor();
my @genes = @{ $gene_adaptor->fetch_all };
my $total_num_genes = scalar @genes;
printf "%3s: Processing %s gene%s from %s\n",
++$num_species,
commify($total_num_genes),
$total_num_genes == 1 ? '' : 's',
$species,
;
my $out = catfile( $out_dir, "${species}-xrefs.tab" );
open my $fh, '>', $out;
print $fh join( "\t", @out_flds, 'synonyms' ), "\n";
my ( $num_gene, $num_xref ) = ( 0, 0 );
for my $gene ( @genes ) {
my @xrefs = @{ $gene->get_all_object_xrefs };
$num_gene++;
printf "%-78s\r", sprintf('%11s (%3s%%): %s has %s xrefs',
commify($num_gene),
int(($num_gene/$total_num_genes) * 100),
$gene->display_id,
commify(scalar @xrefs)
);
for my $xref ( @xrefs ) {
print $fh join( "\t",
( map { $xref->$_() // '' } @out_flds ),
join(', ', @{ $xref->get_all_synonyms }),
), "\n";
$num_xref++;
}
}
close $fh;
printf "\r => Exported %s xrefs for %s genes in %s\n",
commify($num_xref),
commify($num_gene),
$species_timer->(),
;
}
printf "Exported %s species in %s to %s\n",
commify($num_species),
$timer->(),
$out_dir,
;
__END__
# ----------------------------------------------------
=pod
=head1 NAME
extract-ensembl-xrefs.pl - exports Ensembl xrefs
=head1 SYNOPSIS
extract-ensembl-xrefs.pl
Options:
-l|--list Show a list of valid species
-s|--species Comma-separated list of species, default is all
-o|--out Directory to place output files, default is '.'
--help Show brief help and exit
--man Show full documentation
=head1 DESCRIPTION
Describe what the script does, what input it expects, what output it
creates, etc.
=head1 AUTHOR
Ken Youens-Clark E<lt>[email protected]<gt>.
=head1 COPYRIGHT
Copyright (c) 2013 Cold Spring Harbor Laboratory
This module is free software; you can redistribute it and/or
modify it under the terms of the GPL (either version 1, or at
your option, any later version) or the Artistic License 2.0.
Refer to LICENSE for the full license text and to DISCLAIMER for
additional warranty disclaimers.
=cut
@dbolser
Copy link
Author

dbolser commented Aug 8, 2013

This code boils down to running 'Bio::EnsEMBL::Gene->get_all_object_xrefs' [1] on each gene for each species.

[1] http://www.ensembl.org/info/docs/Doxygen/core-api/classBio_1_1EnsEMBL_1_1Gene.html#a8d9c50953e4b8125f641fe691b83c160

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment