Created
June 14, 2013 03:30
-
-
Save standage/5779266 to your computer and use it in GitHub Desktop.
Given a set of transcript or protein alignments in GeneSeqer or GenomeThreader format, retrieve all sequences that map to the specified genomic region.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| use strict; | |
| use Bio::SeqIO; | |
| use Getopt::Long; | |
| # Given a set of transcript or protein alignments in GeneSeqer or GenomeThreader | |
| # format, retrieve all sequences that map to the specified genomic region | |
| # | |
| # Example: perl gsq-select.pl Chr1:100001-200000 tair-ests.fa est-alignments.gsq | |
| my $usage = "perl $0 seqid:start-end seqs.fa aligns1.gsq [aligns2.gsq ...]"; | |
| my $grange = shift(@ARGV) or die("Usage: $usage"); | |
| my $seqfile = shift(@ARGV) or die("Usage: $usage"); | |
| my @gsqfiles = @ARGV or die("Usage: $usage"); | |
| my($seqid, $rangestart, $rangeend) = $grange =~ m/(.+):(\d+)-(\d+)/; | |
| my %alignedseqs; | |
| foreach my $gsqfile(@gsqfiles) | |
| { | |
| open(my $GSQ, "<", $gsqfile) or die("error opening GSQ/GTH file '$gsqfile' $!"); | |
| while(my $line = <$GSQ>) | |
| { | |
| if($line =~ m/PGS_$seqid[+-]_(.+)[+-]\s+\((\d+).+(\d+)\)/) | |
| { | |
| my $alignseqid = $1; | |
| my $alignstart = $2; | |
| my $alignend = $3; | |
| ($alignstart, $alignend) = ($3, $2) if($alignstart > $alignend); | |
| my $startoverlap =($alignstart >= $rangestart and $alignstart <= $rangeend); | |
| my $endoverlap =($alignend >= $rangestart and $alignend <= $rangeend); | |
| $alignedseqs{ $alignseqid } = 1 if($startoverlap or $endoverlap); | |
| } | |
| } | |
| close($GSQ); | |
| } | |
| my $sequences = Bio::SeqIO->new("-file" => $seqfile, "-format" => "Fasta"); | |
| my $outseqs = Bio::SeqIO->new("-fh" => \*STDOUT, "-format" => "Fasta"); | |
| while(my $seq = $sequences->next_seq()) | |
| { | |
| my $id = $seq->id; | |
| if($id =~ m/(lcl|gi)\|([^|]+)/) | |
| { | |
| $id = $2; | |
| } | |
| $outseqs->write_seq($seq) if($alignedseqs{ $id }); | |
| } | |
| $sequences->close(); | |
| $outseqs->close(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment