dbolser · August 8, 2013 08:31 · dbolser · Aug 8, 2013
diff --git a/gistfile1.pl b/gistfile1.pl
 #!/usr/bin/env perl

 $| = 1;

 use strict;
 use warnings;
 use autodie;
 use feature 'say';
 use Class::Load 'load_class';
 use Cwd 'cwd';
 use Data::Dump 'dump';
 use File::Spec::Functions;
 use File::Path 'mkpath';
 use Getopt::Long;
 use Grm::Config;
 use Grm::Utils qw(commify timer_calc);
 use Pod::Usage;
 use Readonly;

 my $list_species     = 0;
 my $out_dir          = cwd();
 my $selected_species = '';
 my ( $help, $man_page );
 GetOptions(
    'l|list'      => \$list_species,
    's|species:s' => \$selected_species,
    'o|out:s'     => \$out_dir,
    'help'        => \$help,
    'man'         => \$man_page,
 ) or pod2usage(2);

 if ( $help || $man_page ) {
    pod2usage({
        -exitval => 0,
        -verbose => $man_page ? 2 : 1
    });
 }; 

 my $timer       = timer_calc();
 my $conf        = Grm::Config->new;
 my $ens_conf    = $conf->get('ensembl');
 my $reg_file    = $ens_conf->{'registry_file'} or die 'No registry file';
 my $install_dir = $ens_conf->{'install_dir'}   or die 'No install dir';

 if ( $reg_file && $install_dir && -e $reg_file && -d $install_dir ) {
    my %inc = map { $_, 1 } @INC;
    for my $dir ( qw{
        conf
        ensembl-compara/modules
        ensembl-draw/modules
        ensembl-external/modules
        ensembl-variation/modules
        ensembl/modules
        modules
    } ) {
        my $path = catdir( $install_dir, $dir );

        if ( !$inc{ $path } ) {
            push @INC, $path;
        }
    }
 }

 $out_dir = canonpath( $out_dir );
 if ( !-d $out_dir ) {
    mkpath( $out_dir );
 }

 my $reg_class = 'Bio::EnsEMBL::Registry';
 load_class( $reg_class ) or die "Cannot load '$reg_class'";
 $reg_class->load_all( $reg_file );

 my %valid_db_adapators = 
    map { lc $_->species, $_ } 
    grep { $_->group eq 'core'}
    @{ $reg_class->get_all_DBAdaptors() }
 ;

 if ( $list_species ) {
    printf "Valid species:\n%s\n", 
        join( "\n", map { " - $_" } sort keys %valid_db_adapators ),
    ;
    exit 0;
 }

 my %selected_species = 
    map { s/ /_/g; lc $_, 1 }
    split(/\s*,\s*/, $selected_species);

 if ( %selected_species ) {
    if ( 
        my @bad = grep { !$valid_db_adapators{ $_ } } keys %selected_species 
    ) {
        printf "Bad species:\n%s\n", join( "\n", map { " - $_" } @bad );
        exit 0;
    }
 }
 elsif ( defined $selected_species{'all'} || !%selected_species ) {
    %selected_species = %valid_db_adapators;
 }

 printf "Exporting %s species to '%s'\n", 
    commify(scalar keys %selected_species), $out_dir;

 my @out_flds = qw[ 
    ensembl_id
    ensembl_object_type 
    db_display_name 
    linkage_annotation 
    dbname 
    release 
    info_type 
    info_text 
    type 
    version 
    secondary_db_name 
    secondary_db_table 
    description 
 ];

 my $num_species = 0;
 for my $species ( sort keys %valid_db_adapators ) {
    my $species_timer = timer_calc();
    my $db = $valid_db_adapators{ $species };

    if ( %selected_species ) {
        next unless defined $selected_species{ lc $species };
    }

    my $gene_adaptor    = $db->get_GeneAdaptor();
    my @genes           = @{ $gene_adaptor->fetch_all };
    my $total_num_genes = scalar @genes;

    printf "%3s: Processing %s gene%s from %s\n", 
        ++$num_species, 
        commify($total_num_genes),
        $total_num_genes == 1 ? '' : 's',
        $species,
    ;

    my $out = catfile( $out_dir, "${species}-xrefs.tab" );
    open my $fh, '>', $out;

    print $fh join( "\t", @out_flds, 'synonyms' ), "\n";


    my ( $num_gene, $num_xref ) = ( 0, 0 );
    for my $gene ( @genes ) {
        my @xrefs = @{ $gene->get_all_object_xrefs };
        $num_gene++;

        printf "%-78s\r", sprintf('%11s (%3s%%): %s has %s xrefs',
            commify($num_gene), 
            int(($num_gene/$total_num_genes) * 100),
            $gene->display_id, 
            commify(scalar @xrefs)
        );

        for my $xref ( @xrefs ) {
            print $fh join( "\t", 
                ( map { $xref->$_() // '' } @out_flds ), 
                join(', ', @{ $xref->get_all_synonyms }),
            ), "\n";
            $num_xref++;
        }
    }

    close $fh;
    printf "\r     => Exported %s xrefs for %s genes in %s\n", 
        commify($num_xref), 
        commify($num_gene), 
        $species_timer->(),
    ;
 }

 printf "Exported %s species in %s to %s\n", 
    commify($num_species), 
    $timer->(),
    $out_dir,
 ;

 __END__

 # ----------------------------------------------------

 =pod

 =head1 NAME

 extract-ensembl-xrefs.pl - exports Ensembl xrefs

 =head1 SYNOPSIS

  extract-ensembl-xrefs.pl 

 Options:

  -l|--list     Show a list of valid species
  -s|--species  Comma-separated list of species, default is all
  -o|--out      Directory to place output files, default is '.'

  --help        Show brief help and exit
  --man         Show full documentation

 =head1 DESCRIPTION

 Describe what the script does, what input it expects, what output it
 creates, etc.

 =head1 AUTHOR

 Ken Youens-Clark E<lt>[email protected]<gt>.

 =head1 COPYRIGHT

 Copyright (c) 2013 Cold Spring Harbor Laboratory

 This module is free software; you can redistribute it and/or
 modify it under the terms of the GPL (either version 1, or at
 your option, any later version) or the Artistic License 2.0.
 Refer to LICENSE for the full license text and to DISCLAIMER for
 additional warranty disclaimers.

 =cut
	#!/usr/bin/env perl

	$\| = 1;

	use strict;
	use warnings;
	use autodie;
	use feature 'say';
	use Class::Load 'load_class';
	use Cwd 'cwd';
	use Data::Dump 'dump';
	use File::Spec::Functions;
	use File::Path 'mkpath';
	use Getopt::Long;
	use Grm::Config;
	use Grm::Utils qw(commify timer_calc);
	use Pod::Usage;
	use Readonly;

	my $list_species = 0;
	my $out_dir = cwd();
	my $selected_species = '';
	my ( $help, $man_page );
	GetOptions(
	'l\|list' => \$list_species,
	's\|species:s' => \$selected_species,
	'o\|out:s' => \$out_dir,
	'help' => \$help,
	'man' => \$man_page,
	) or pod2usage(2);

	if ( $help \|\| $man_page ) {
	pod2usage({
	-exitval => 0,
	-verbose => $man_page ? 2 : 1
	});
	};

	my $timer = timer_calc();
	my $conf = Grm::Config->new;
	my $ens_conf = $conf->get('ensembl');
	my $reg_file = $ens_conf->{'registry_file'} or die 'No registry file';
	my $install_dir = $ens_conf->{'install_dir'} or die 'No install dir';

	if ( $reg_file && $install_dir && -e $reg_file && -d $install_dir ) {
	my %inc = map { $_, 1 } @INC;
	for my $dir ( qw{
	conf
	ensembl-compara/modules
	ensembl-draw/modules
	ensembl-external/modules
	ensembl-variation/modules
	ensembl/modules
	modules
	} ) {
	my $path = catdir( $install_dir, $dir );

	if ( !$inc{ $path } ) {
	push @INC, $path;
	}
	}
	}

	$out_dir = canonpath( $out_dir );
	if ( !-d $out_dir ) {
	mkpath( $out_dir );
	}

	my $reg_class = 'Bio::EnsEMBL::Registry';
	load_class( $reg_class ) or die "Cannot load '$reg_class'";
	$reg_class->load_all( $reg_file );

	my %valid_db_adapators =
	map { lc $_->species, $_ }
	grep { $_->group eq 'core'}
	@{ $reg_class->get_all_DBAdaptors() }
	;

	if ( $list_species ) {
	printf "Valid species:\n%s\n",
	join( "\n", map { " - $_" } sort keys %valid_db_adapators ),
	;
	exit 0;
	}

	my %selected_species =
	map { s/ /_/g; lc $_, 1 }
	split(/\s,\s/, $selected_species);

	if ( %selected_species ) {
	if (
	my @bad = grep { !$valid_db_adapators{ $_ } } keys %selected_species
	) {
	printf "Bad species:\n%s\n", join( "\n", map { " - $_" } @bad );
	exit 0;
	}
	}
	elsif ( defined $selected_species{'all'} \|\| !%selected_species ) {
	%selected_species = %valid_db_adapators;
	}

	printf "Exporting %s species to '%s'\n",
	commify(scalar keys %selected_species), $out_dir;

	my @out_flds = qw[
	ensembl_id
	ensembl_object_type
	db_display_name
	linkage_annotation
	dbname
	release
	info_type
	info_text
	type
	version
	secondary_db_name
	secondary_db_table
	description
	];

	my $num_species = 0;
	for my $species ( sort keys %valid_db_adapators ) {
	my $species_timer = timer_calc();
	my $db = $valid_db_adapators{ $species };

	if ( %selected_species ) {
	next unless defined $selected_species{ lc $species };
	}

	my $gene_adaptor = $db->get_GeneAdaptor();
	my @genes = @{ $gene_adaptor->fetch_all };
	my $total_num_genes = scalar @genes;

	printf "%3s: Processing %s gene%s from %s\n",
	++$num_species,
	commify($total_num_genes),
	$total_num_genes == 1 ? '' : 's',
	$species,
	;

	my $out = catfile( $out_dir, "${species}-xrefs.tab" );
	open my $fh, '>', $out;

	print $fh join( "\t", @out_flds, 'synonyms' ), "\n";


	my ( $num_gene, $num_xref ) = ( 0, 0 );
	for my $gene ( @genes ) {
	my @xrefs = @{ $gene->get_all_object_xrefs };
	$num_gene++;

	printf "%-78s\r", sprintf('%11s (%3s%%): %s has %s xrefs',
	commify($num_gene),
	int(($num_gene/$total_num_genes) * 100),
	$gene->display_id,
	commify(scalar @xrefs)
	);

	for my $xref ( @xrefs ) {
	print $fh join( "\t",
	( map { $xref->$_() // '' } @out_flds ),
	join(', ', @{ $xref->get_all_synonyms }),
	), "\n";
	$num_xref++;
	}
	}

	close $fh;
	printf "\r => Exported %s xrefs for %s genes in %s\n",
	commify($num_xref),
	commify($num_gene),
	$species_timer->(),
	;
	}

	printf "Exported %s species in %s to %s\n",
	commify($num_species),
	$timer->(),
	$out_dir,
	;

	__END__

	# ----------------------------------------------------

	=pod

	=head1 NAME

	extract-ensembl-xrefs.pl - exports Ensembl xrefs

	=head1 SYNOPSIS

	extract-ensembl-xrefs.pl

	Options:

	-l\|--list Show a list of valid species
	-s\|--species Comma-separated list of species, default is all
	-o\|--out Directory to place output files, default is '.'

	--help Show brief help and exit
	--man Show full documentation

	=head1 DESCRIPTION

	Describe what the script does, what input it expects, what output it
	creates, etc.

	=head1 AUTHOR

	Ken Youens-Clark E<lt>[email protected]<gt>.

	=head1 COPYRIGHT

	Copyright (c) 2013 Cold Spring Harbor Laboratory

	This module is free software; you can redistribute it and/or
	modify it under the terms of the GPL (either version 1, or at
	your option, any later version) or the Artistic License 2.0.
	Refer to LICENSE for the full license text and to DISCLAIMER for
	additional warranty disclaimers.

	=cut