Skip to content

Instantly share code, notes, and snippets.

@zhoujj2013
Last active August 29, 2015 14:03
Show Gist options
  • Save zhoujj2013/a0b71b9c69cd41358f4b to your computer and use it in GitHub Desktop.
Save zhoujj2013/a0b71b9c69cd41358f4b to your computer and use it in GitHub Desktop.
extract_multialign_block from t-coffee fasta msa result
#!/usr/bin/perl -w
use strict;
use Data::Dumper;
my ($file, $max_gap, $min_block) = @ARGV;
my %fa;
my @id;
my $len = 0;
my %block_start;
open IN,"$file" || die $!;
$/ = ">"; <IN>; $/ = "\n";
while(<IN>){
chomp;
my $id = $1 if(/(\S+)/);
push @id, $id;
$/ = ">";
my $seq = <IN>;
chomp($seq);
$seq =~ s/\n//g;
$len = length($seq);
$/ = "\n";
my @seq = split //,$seq;
$fa{$id} = \@seq;
$block_start{$id} = 0;
}
close IN;
#print Dumper(\%fa);
my $block_num = 0;
my @block;
my @sub_block;
my $flag = 0;
foreach my $i (0 .. $len-1){
my @pos;
foreach my $id (@id){
push @pos,$fa{$id}[$i];
unless($fa{$id}[$i] eq "-"){
$block_start{$id}++;
}
}
my $r = check_same(\@pos);
if($r == 0){
$flag++;
push @sub_block,\@pos;
}else{
if($flag >= $max_gap && scalar(@block) > 0){
#print Dumper(\@block);
if(scalar(@block) > $min_block){
print ">block_$block_num\_start\n";
my $k = 0;
foreach my $name (@id){
my $name_len = 0;
my $name_len_sub = 0;
foreach my $p (@block){
$name_len++ if($p->[$k] =~ /[ATGCatgc]/);
}
foreach my $p (@sub_block){
$name_len_sub++ if($p->[$k] =~ /[ATGCatgc]/);
}
$k++;
my $t_len = $block_start{$name} - $name_len - $name_len_sub;
print "$name\t$t_len\t$name_len\n";
}
print ">block_$block_num\n";
$block_num++;
}
my $j = 0;
foreach my $id (@id){
if(scalar(@block) > $min_block){
print "$id\t";
}
my @str;
foreach my $b (@block){
#print Dumper($b);
push @str,$b->[$j];
}
if(scalar(@block) > $min_block){
my $str = join "",@str;
#$str =~ s/-//g;
print $str;
print "\n";
}
$j++;
}
$flag = 0;
@block = ();
push @block,\@pos;
@sub_block = ();
}elsif($flag > 0 && $flag < $max_gap){
push @block, @sub_block;
push @block,\@pos;
@sub_block = ();
$flag = 0;
}else{
push @block,\@pos;
}
}
}
if($flag < $max_gap && scalar(@block) > $min_block){
print ">block_$block_num\_start\n";
my $k = 0;
foreach my $name (@id){
my $name_len = 0;
my $name_len_sub = 0;
foreach my $p (@block){
$name_len++ if($p->[$k] =~ /[ATGCatgc]/);
}
foreach my $p (@sub_block){
$name_len_sub++ if($p->[$k] =~ /[ATGCatgc]/);
}
$k++;
my $t_len = $block_start{$name} - $name_len - $name_len_sub;
print "$name\t$t_len\t$name_len\n";
}
print ">block_$block_num\n";
my $j = 0;
foreach my $id (@id){
print "$id\t";
my @str;
foreach my $b (@block){
push @str,$b->[$j];
}
my $str = join "",@str;
#$str =~ s/-//g;
print "$str";
print "\n";
$j++;
}
}
sub check_same{
my ($arr) = @_;
my $flag = 1;
foreach my $nt (@{$arr}){
unless($nt =~ /[ATGCatgc]/){
$flag = 0;
}
}
return $flag;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment