LeeBergstrand · August 28, 2024 02:27 · dburkhardt · May 29, 2015
diff --git a/all.hmm.ps.len b/all.hmm.ps.len
 Cyp125(GramPos)	408
diff --git a/DbCAN_HMMSCAN_Parser_Problem.md b/DbCAN_HMMSCAN_Parser_Problem.md
diff --git a/endperl.pl b/endperl.pl
 #!/usr/bin/env perl
 # Diagnostic trace of the final E-value filter exactly as it is written in
 # hmmscan-parser.sh, i.e. using the difference of the LAST two tab-separated
 # fields. For each input line it prints those two fields, their difference,
 # which cutoff the difference selects, whether field index 2 fails that
 # cutoff, and finally echoes the line only when it passes.
 # NOTE(review): the banner printed below labels the last column HMM.Cover,
 # so on such input the "length" is HMM.Cover minus Ali.To -- presumably the
 # behaviour this script exists to expose; compare with endperl2.pl.
 while (my $line = <>)
 {
 	# No chomp: the last field keeps its newline, which is why the
 	# "a[-1] = ..." print below carries no "\n" of its own.
 	my @col  = split(/\t/, $line);
 	my $span = $col[-1] - $col[-2];

 	# Decide once which cutoff applies and how it is reported.
 	my ($verdict, $cutoff, $cutoff_text) =
 		$span > 80 ? ("TRUE", 1e-5, "1e-5") : ("FALSE", 1e-3, "1e-3");
 	my $passes = $col[2] < $cutoff;

 	print "\n======================================================================================================\n";
 	print "a[-2] = $col[-2] \n";
 	print "a[-1] = $col[-1]";
 	print "(a[-1]-a[-2]) = ".$span." \n";

 	print "a[-1]-a[-2]>80 = $verdict \n\n";
 	if (not $passes)
 	{
 		print "Domain alignment should be deleted.\n";
 		print "E-Value >= $cutoff_text\n\n";
 	}

 	# Column banner, then the line itself if it survives the filter.
 	print "Prot.ID\t\tHMM.Model\t\tEValue\tHMM.From HMM.To\tAli.From Ali.To\tHMM.Cover\n";
 	print "-----------------------------------------------------------------------------------------------------\n";
 	print $line if $passes;
 }
diff --git a/endperl2.pl b/endperl2.pl
 #!/usr/bin/env perl
 # Diagnostic trace of the E-value filter using the third-from-last and
 # second-from-last tab-separated fields as the two ends of the span.
 # For each input line: print both fields and their difference, report which
 # cutoff (1e-5 when the difference exceeds 80, otherwise 1e-3) applies,
 # flag the line when field index 2 is not below that cutoff, then print a
 # column banner and echo the line only when it passes.
 # NOTE(review): going by the banner text, these two fields are presumably
 # Ali.From and Ali.To -- confirm against the actual input.
 while (<>)
 {
 	my @f    = split /\t/;
 	my $lo   = $f[-3];
 	my $hi   = $f[-2];
 	my $span = $hi - $lo;

 	my $is_long     = $span > 80;
 	my $cutoff      = $is_long ? 1e-5   : 1e-3;
 	my $cutoff_text = $is_long ? "1e-5" : "1e-3";
 	my $rejected    = !($f[2] < $cutoff);

 	print "\n======================================================================================================\n";
 	print "a[-3] = $lo \n";
 	print "a[-2] = $hi \n";
 	print "(a[-2]-a[-3]) = ".$span." \n";

 	print "a[-2]-a[-3]>80 = ".($is_long ? "TRUE" : "FALSE")." \n\n";
 	print "Domain alignment should be deleted.\n", "E-Value >= $cutoff_text\n\n" if $rejected;

 	print "Prot.ID\t\tHMM.Model\t\tEValue\tHMM.From HMM.To\tAli.From Ali.To\tHMM.Cover\n";
 	print "-----------------------------------------------------------------------------------------------------\n";
 	print $_ unless $rejected;
 }
diff --git a/Expanded-HMMscanParser.pl b/Expanded-HMMscanParser.pl
 # Expanded Perl (This perl does not work!)

 # cat $1 | perl -e
 # Stage 1: buffer input lines until a line starting with "//", then turn the
 # buffered record into tab-separated rows:
 #   query, model line, and whitespace-split fields 6, 7, 8, 10, 11
 # of every row from the 4th line of each ">> " section onward.
 # NOTE(review): fields 6/7/8/10/11 are presumably hmmscan's i-Evalue,
 # hmm from/to and ali from/to -- confirm against the HMMER report layout.
 while(<>)
 {
 	# Anything that is not the "//" record terminator is only collected.
 	unless(/^\/\//)
 	{
 		push(@a,$_);
 		next;
 	}

 	$x=join("",@a);
 	@a=(); # start a fresh buffer for the next record
 	($q)=($x=~/^Query:\s+(\S+)/m); # first token after "Query:"

 	# /m: ^ matches at every line start, /s: "." also matches newlines,
 	# /g: iterate over every ">> ..." section up to its first blank line.
 	while($x=~/^>> (\S+.*?\n\n)/msg)
 	{
 		$a=$&; # copy the match before the regex operations below overwrite $&
 		@c=split(/\n/,$a);
 		$c[0]=~s/>> //;

 		# Lines 0-2 of a section are skipped; rows start at index 3.
 		$i=3;
 		while($i<=$#c)
 		{
 			@d=split(/\s+/,$c[$i]);
 			if($d[6]<1)
 			{
 				print join("\t",$q,$c[0],$d[6],$d[7],$d[8],$d[10],$d[11])."\n";
 			}
 			$i++;
 		}
 	}
 }

 # | sort -k 1,1 -k 6,6n -k 7,7n | uniq | perl -e
 # Stage 2: group rows by their first field, then walk each group pairwise
 # and drop one row of every adjacent pair that overlaps by more than half
 # of either row's length. The row with the smaller value in field index 2
 # is the one that is kept.
 while(<>)
 {
 	chomp;
 	@a=split;
 	# Rows whose last two fields are numerically equal are discarded.
 	push(@{$b{$a[0]}},$_) unless $a[-1]==$a[-2];
 }

 foreach(sort keys %b)
 {
 	@a=@{$b{$_}};

 	$i=0;
 	while($i<$#a)
 	{
 		@b=split(/\t/,$a[$i]);   # upper row of the pair
 		@c=split(/\t/,$a[$i+1]); # lower row of the pair
 		$len1=$b[-1]-$b[-2]; # length of the upper row (end - start)
 		$len2=$c[-1]-$c[-2]; # length of the lower row (end - start)
 		$len3=$b[-1]-$c[-2]; # overlap: upper end - lower start

 		if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5))
 		{
 			# Remove the row with the larger field 2; on a tie the upper row goes.
 			splice(@a,($b[2]<$c[2] ? $i+1 : $i),1);
 			# $i is left unchanged so the surviving row is compared with its new neighbour.
 		}
 		else
 		{
 			$i++;
 		}
 	}

 	print $_."\n" foreach @a;
 }

 # | uniq | perl -e
 # Stage 3: append, as a new last column, the fraction of the HMM covered by
 # each row: (field 4 - field 3) / length of the model named in field 1.
 # Model lengths are read from "all.hmm.ps.len" (two whitespace-separated
 # columns: name, length), resolved relative to the current working directory.
 # Dies with an explicit message if that file cannot be opened or if a row
 # names a model without a usable length; previously both cases surfaced only
 # as "Illegal division by zero".
 open(my $len_fh,'<',"all.hmm.ps.len") or die "Cannot open all.hmm.ps.len: $!\n";
 while(<$len_fh>)
 {
 	chomp;
 	@a=split;
 	next unless @a>=2; # ignore blank or incomplete lines
 	$b{$a[0]}=$a[1]; # creates hash of hmmName : hmmLength
 }
 close($len_fh);

 while(<>)
 {
 	chomp;
 	@a=split;
 	die "No HMM length for model '$a[1]' in all.hmm.ps.len\n" unless $b{$a[1]};
 	$r=($a[4]-$a[3])/$b{$a[1]}; # $a[4] = hmm end, $a[3] = hmm start, $b{$a[1]} = hmm length
 	print $_."\t".$r."\n";
 }
 	
 # | perl -e
 # Stage 4: E-value filter. Rows whose span is longer than 80 must have
 # field index 2 below 1e-5; all other rows must have it below 1e-3.
 # The previous stage appends the coverage ratio as the last column, so the
 # two ends of the span are $a[-3] and $a[-2]. The former $a[-1]-$a[-2]
 # subtracted the span end from the coverage ratio, which could never exceed
 # 80, so the 1e-5 cutoff was never applied.
 while(<>)
 {
 	@a=split(/\t/,$_);
 	$cutoff=(($a[-2]-$a[-3])>80) ? 1e-5 : 1e-3;
 	print $_ if $a[2]<$cutoff;
 }
 # | awk '$NF>0.3' # Keeps only rows whose last column (HMM coverage) is greater than 0.3
diff --git a/hmmscan-parser.sh b/hmmscan-parser.sh
 #!/usr/bin/env sh

 # Review note: this script was marked "faulty". The defect corrected here
 # is in the last perl stage, which measured the span as $a[-1]-$a[-2] even
 # though the stage before it appends the coverage ratio as the last column;
 # it now uses $a[-2]-$a[-3]. Other issues may remain -- validate the output
 # before relying on it.

 # Yanbin Yin
 # 08/18/2011
 # hmmscan output parser
 # Usage: sh hmmscan-parser.sh hmmscan-output-file
 #        (with no argument the hmmscan output is read from stdin)

 # 1. take hmmer3 output and generate the tabular output
 # 2. sort on the 6th and 7th cols
 # 3. remove overlapped/redundant hmm matches and keep the one with the lower e-values
 # 4. calculate the covered fraction of hmm (the "all.hmm.ps.len" file is opened
 #    relative to the current working directory; the stage aborts if it is missing)
 # 5. apply the E-value cutoff and the covered fraction cutoff
 cat ${1:+"$1"} | perl -e 'while(<>){if(/^\/\//){$x=join("",@a);($q)=($x=~/^Query:\s+(\S+)/m);while($x=~/^>> (\S+.*?\n\n)/msg){$a=$&;@c=split(/\n/,$a);$c[0]=~s/>> //;for($i=3;$i<=$#c;$i++){@d=split(/\s+/,$c[$i]);print $q."\t".$c[0]."\t$d[6]\t$d[7]\t$d[8]\t$d[10]\t$d[11]\n" if $d[6]<1;}}@a=();}else{push(@a,$_);}}' \
 	| sort -k 1,1 -k 6,6n -k 7,7n | uniq \
 	| perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[0]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[2]<$c[2]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_."\n";}}' \
 	| uniq | perl -e 'open(IN,"<","all.hmm.ps.len") or die "Cannot open all.hmm.ps.len: $!\n";while(<IN>){chomp;@a=split;$b{$a[0]}=$a[1];}close(IN);while(<>){chomp;@a=split;$r=($a[4]-$a[3])/$b{$a[1]};print $_."\t".$r."\n";}' \
 	| perl -e 'while(<>){@a=split(/\t/,$_);if(($a[-2]-$a[-3])>80){print $_ if $a[2]<1e-5;}else{print $_ if $a[2]<1e-3;}}' | awk '$NF>0.3'
diff --git a/TestResults.zip b/TestResults.zip
	#!/usr/bin/env perl
	# Diagnostic trace of an E-value filter that takes the difference of the
	# LAST two tab-separated fields as the span. For each input line it prints
	# those fields, their difference, the cutoff the difference selects, a
	# notice when field index 2 is not below that cutoff, a column banner, and
	# finally the line itself when it passes.
	# NOTE(review): the banner labels the last column HMM.Cover, so on such
	# input the difference is HMM.Cover minus Ali.To -- presumably intentional
	# for demonstrating the problem; confirm.
	while (my $line = <>)
	{
		# The line is not chomped, so the last field still ends in "\n".
		my @col  = split(/\t/, $line);
		my $span = $col[-1] - $col[-2];

		my ($verdict, $cutoff, $cutoff_text) =
			$span > 80 ? ("TRUE", 1e-5, "1e-5") : ("FALSE", 1e-3, "1e-3");
		my $passes = $col[2] < $cutoff;

		print "\n======================================================================================================\n";
		print "a[-2] = $col[-2] \n";
		print "a[-1] = $col[-1]";
		print "(a[-1]-a[-2]) = ".$span." \n";

		print "a[-1]-a[-2]>80 = $verdict \n\n";
		if (not $passes)
		{
			print "Domain alignment should be deleted.\n";
			print "E-Value >= $cutoff_text\n\n";
		}

		print "Prot.ID\t\tHMM.Model\t\tEValue\tHMM.From HMM.To\tAli.From Ali.To\tHMM.Cover\n";
		print "-----------------------------------------------------------------------------------------------------\n";
		print $line if $passes;
	}
	# Expanded Perl (This perl does not work!)

	# cat $1 | perl -e
	# Stage 1: collect input lines until a line starting with "//", then emit
	# one tab-separated row (query, model line, whitespace-split fields
	# 6, 7, 8, 10, 11) for every row from the 4th line of each ">> " section
	# whose field 6 is below 1.
	# NOTE(review): the field positions presumably match hmmscan's per-domain
	# table (i-Evalue, hmm from/to, ali from/to) -- confirm.
	while(<>)
	{
		# Non-terminator lines are only buffered.
		unless(/^\/\//)
		{
			push(@a,$_);
			next;
		}

		$x=join("",@a);
		@a=(); # reset the buffer for the next record
		($q)=($x=~/^Query:\s+(\S+)/m); # first token after "Query:"

		# multiline, "." matches newline, global
		while($x=~/^>> (\S+.*?\n\n)/msg)
		{
			$a=$&; # save the match before later regex operations overwrite $&
			@c=split(/\n/,$a);
			$c[0]=~s/>> //;

			$i=3; # rows start at the 4th line of the section
			while($i<=$#c)
			{
				@d=split(/\s+/,$c[$i]);
				if($d[6]<1)
				{
					print join("\t",$q,$c[0],$d[6],$d[7],$d[8],$d[10],$d[11])."\n";
				}
				$i++;
			}
		}
	}

	# | sort -k 1,1 -k 6,6n -k 7,7n | uniq | perl -e
	# Stage 2: bucket rows by first field, then compare neighbouring rows in
	# each bucket and discard one of any pair whose overlap exceeds half the
	# length of either row. The row with the smaller field index 2 survives.
	while(<>)
	{
		chomp;
		@a=split;
		# Skip rows whose last two fields are numerically equal.
		push(@{$b{$a[0]}},$_) unless $a[-1]==$a[-2];
	}

	foreach(sort keys %b)
	{
		@a=@{$b{$_}};

		$i=0;
		while($i<$#a)
		{
			@b=split(/\t/,$a[$i]);   # upper row of the pair
			@c=split(/\t/,$a[$i+1]); # lower row of the pair
			$len1=$b[-1]-$b[-2]; # upper row: end - start
			$len2=$c[-1]-$c[-2]; # lower row: end - start
			$len3=$b[-1]-$c[-2]; # overlap: upper end - lower start

			if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5))
			{
				# Drop the row with the larger field 2 (the upper row on a tie)
				# and stay on the same index to re-check the survivor.
				splice(@a,($b[2]<$c[2] ? $i+1 : $i),1);
			}
			else
			{
				$i++;
			}
		}

		print $_."\n" foreach @a;
	}

	# | uniq | perl -e
	# Stage 3: append the covered fraction of the HMM as a new last column:
	# (field 4 - field 3) / length of the model named in field 1.
	# Lengths come from "all.hmm.ps.len" (name, length per line), opened
	# relative to the current working directory. The stage now dies with an
	# explicit message when the file cannot be opened or a model has no usable
	# length, instead of failing with "Illegal division by zero".
	open(my $len_fh,'<',"all.hmm.ps.len") or die "Cannot open all.hmm.ps.len: $!\n";
	while(<$len_fh>)
	{
		chomp;
		@a=split;
		next unless @a>=2; # ignore blank or incomplete lines
		$b{$a[0]}=$a[1]; # creates hash of hmmName : hmmLength
	}
	close($len_fh);

	while(<>)
	{
		chomp;
		@a=split;
		die "No HMM length for model '$a[1]' in all.hmm.ps.len\n" unless $b{$a[1]};
		$r=($a[4]-$a[3])/$b{$a[1]}; # $a[4] = hmm end, $a[3] = hmm start, $b{$a[1]} = hmm length
		print $_."\t".$r."\n";
	}

	# | perl -e
	# Stage 4: E-value filter. A row whose span exceeds 80 must have field
	# index 2 below 1e-5; any other row must have it below 1e-3.
	# The preceding stage appends the coverage ratio as the last column, so
	# the span is $a[-2]-$a[-3]. The former $a[-1]-$a[-2] subtracted the span
	# end from the coverage ratio and therefore never selected the 1e-5 cutoff.
	while(<>)
	{
		@a=split(/\t/,$_);
		$cutoff=(($a[-2]-$a[-3])>80) ? 1e-5 : 1e-3;
		print $_ if $a[2]<$cutoff;
	}
	# | awk '$NF>0.3' # Keeps only rows whose last column (HMM coverage) is greater than 0.3
	#!/usr/bin/env sh

	# Review note: this script was marked "faulty". Two things are corrected
	# here: the pipe characters had been escaped as "\|", which made every
	# later stage a literal argument of the first command; and the last perl
	# stage measured the span as $a[-1]-$a[-2] although the stage before it
	# appends the coverage ratio as the last column (now $a[-2]-$a[-3]).
	# Other issues may remain -- validate the output before relying on it.

	# Yanbin Yin
	# 08/18/2011
	# hmmscan output parser
	# Usage: sh hmmscan-parser.sh hmmscan-output-file
	#        (with no argument the hmmscan output is read from stdin)

	# 1. take hmmer3 output and generate the tabular output
	# 2. sort on the 6th and 7th cols
	# 3. remove overlapped/redundant hmm matches and keep the one with the lower e-values
	# 4. calculate the covered fraction of hmm (the "all.hmm.ps.len" file is opened
	#    relative to the current working directory; the stage aborts if it is missing)
	# 5. apply the E-value cutoff and the covered fraction cutoff
	cat ${1:+"$1"} | perl -e 'while(<>){if(/^\/\//){$x=join("",@a);($q)=($x=~/^Query:\s+(\S+)/m);while($x=~/^>> (\S+.*?\n\n)/msg){$a=$&;@c=split(/\n/,$a);$c[0]=~s/>> //;for($i=3;$i<=$#c;$i++){@d=split(/\s+/,$c[$i]);print $q."\t".$c[0]."\t$d[6]\t$d[7]\t$d[8]\t$d[10]\t$d[11]\n" if $d[6]<1;}}@a=();}else{push(@a,$_);}}' \
		| sort -k 1,1 -k 6,6n -k 7,7n | uniq \
		| perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[0]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[2]<$c[2]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_."\n";}}' \
		| uniq | perl -e 'open(IN,"<","all.hmm.ps.len") or die "Cannot open all.hmm.ps.len: $!\n";while(<IN>){chomp;@a=split;$b{$a[0]}=$a[1];}close(IN);while(<>){chomp;@a=split;$r=($a[4]-$a[3])/$b{$a[1]};print $_."\t".$r."\n";}' \
		| perl -e 'while(<>){@a=split(/\t/,$_);if(($a[-2]-$a[-3])>80){print $_ if $a[2]<1e-5;}else{print $_ if $a[2]<1e-3;}}' | awk '$NF>0.3'