Skip to content

Instantly share code, notes, and snippets.

@standage
Created June 14, 2013 03:30
Show Gist options
  • Select an option

  • Save standage/5779266 to your computer and use it in GitHub Desktop.

Select an option

Save standage/5779266 to your computer and use it in GitHub Desktop.
Given a set of transcript or protein alignments in GeneSeqer or GenomeThreader format, retrieve all sequences that map to the specified genomic region.
#!/usr/bin/env perl
use strict;
use Bio::SeqIO;
use Getopt::Long;
# Given a set of transcript or protein alignments in GeneSeqer or GenomeThreader
# format, retrieve all sequences that map to the specified genomic region
#
# Example: perl gsq-select.pl Chr1:100001-200000 tair-ests.fa est-alignments.gsq
my $usage = "perl $0 seqid:start-end seqs.fa aligns1.gsq [aligns2.gsq ...]";
my $grange = shift(@ARGV) or die("Usage: $usage");
my $seqfile = shift(@ARGV) or die("Usage: $usage");
my @gsqfiles = @ARGV or die("Usage: $usage");
my($seqid, $rangestart, $rangeend) = $grange =~ m/(.+):(\d+)-(\d+)/;
my %alignedseqs;
foreach my $gsqfile(@gsqfiles)
{
open(my $GSQ, "<", $gsqfile) or die("error opening GSQ/GTH file '$gsqfile' $!");
while(my $line = <$GSQ>)
{
if($line =~ m/PGS_$seqid[+-]_(.+)[+-]\s+\((\d+).+(\d+)\)/)
{
my $alignseqid = $1;
my $alignstart = $2;
my $alignend = $3;
($alignstart, $alignend) = ($3, $2) if($alignstart > $alignend);
my $startoverlap =($alignstart >= $rangestart and $alignstart <= $rangeend);
my $endoverlap =($alignend >= $rangestart and $alignend <= $rangeend);
$alignedseqs{ $alignseqid } = 1 if($startoverlap or $endoverlap);
}
}
close($GSQ);
}
my $sequences = Bio::SeqIO->new("-file" => $seqfile, "-format" => "Fasta");
my $outseqs = Bio::SeqIO->new("-fh" => \*STDOUT, "-format" => "Fasta");
while(my $seq = $sequences->next_seq())
{
my $id = $seq->id;
if($id =~ m/(lcl|gi)\|([^|]+)/)
{
$id = $2;
}
$outseqs->write_seq($seq) if($alignedseqs{ $id });
}
$sequences->close();
$outseqs->close();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment