Created
January 17, 2013 13:29
-
-
Save josephhughes/4555921 to your computer and use it in GitHub Desktop.
Use this script to get the number of reads in each cluster
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# use this to get the number of reads in each cluster | |
use strict; | |
use Getopt::Long; | |
use Bio::SeqIO; | |
my ($clstr,$result,$long,%clusters,$infile); | |
&GetOptions( | |
'clstr:s' =>\$clstr, #a cd-hit generated cluster file | |
'out:s' => \$result, # a text file with the numbers of reads in each cluster | |
); | |
print "Input cluster file $clstr, fatsa file $infile, output $result and fasta output $long\n"; | |
open(CLUSTER,"<$clstr")||die "Can't open $clstr\n"; | |
my $clusterid; | |
my $longest; | |
my %cluster; | |
while(<CLUSTER>){ | |
if ($_=~/^>(.+)$/){ | |
$clusterid=$1; | |
#$longest=""; | |
#print "$clusterid\n"; | |
}elsif ($_=~/\d+.+\>(.+)\.\.\..+\*$/){ | |
my $id=$1; | |
#print "$id\n"; | |
$longest=$id; | |
$clusters{$clusterid}{$id}=$longest; | |
#print "$clusterid $longest\n"; | |
} | |
elsif ($_=~/\d+.+\>(.+)\.\.\..+$/){ | |
my $id=$1; | |
#print "$id\n"; | |
$clusters{$clusterid}{$id}=$longest; | |
#print "clusterid\t$id Shorter than $longest\n"; | |
} | |
} | |
open(OUT,">$result")||die "Can't open $result\n"; | |
print OUT "ClusterID\tNbSeqs\n"; | |
foreach my $clid (keys %clusters){ | |
my $nbseqs=keys %{$clusters{$clid}}; | |
print OUT "$clid\t$nbseqs\n"; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment