josephhughes · January 17, 2013 13:29
diff --git a/parse_cdhit.pl b/parse_cdhit.pl
 # use this to get the number of reads in each cluster
 use strict;
 use Getopt::Long; 
 use Bio::SeqIO;

 # Count how many distinct sequences each cluster of a CD-HIT ".clstr" file
 # contains and write the counts as a two-column, tab-separated table.
 #
 # Options:
 #   --clstr FILE   cd-hit generated cluster file to read
 #   --out   FILE   text file to write ("ClusterID<TAB>NbSeqs", one row per cluster)
 #
 # Dies with a usage message when an option is missing, and with the system
 # error when a file cannot be opened or the output cannot be flushed.
 use warnings;

 my ($clstr, $result);
 my $usage = "Usage: $0 --clstr <cd-hit .clstr file> --out <output table>\n";
 GetOptions(
     'clstr:s' => \$clstr,     # a cd-hit generated cluster file
     'out:s'   => \$result,    # a text file with the numbers of reads in each cluster
 ) or die $usage;

 # ":s" makes the option value optional, so an empty string must be rejected too.
 die $usage
     unless defined $clstr && length $clstr && defined $result && length $result;

 print "Input cluster file $clstr, output $result\n";

 open my $cluster_fh, '<', $clstr or die "Can't open $clstr: $!\n";

 my %clusters;         # cluster id => { sequence id => 1 }
 my @cluster_order;    # cluster ids in the order their first member was read
 my $clusterid;        # id of the cluster whose members are currently being read

 while (my $line = <$cluster_fh>) {
     # Strip the line ending (LF or CRLF) so a stray "\r" never ends up in an id.
     $line =~ s/\r?\n\z//;

     if ($line =~ /^>(.+)$/) {
         # Cluster header line, e.g. ">Cluster 0".
         $clusterid = $1;
     }
     elsif ($line =~ /\d+.+>(.+)\.\.\..+$/) {
         # Member line: "<index> <length>, ><sequence id>... <'*' or 'at NN%'>".
         # The representative ('*') and the other members are counted alike.
         my $id = $1;
         if (!defined $clusterid) {
             warn "Skipping member '$id' found before any cluster header\n";
             next;
         }
         push @cluster_order, $clusterid unless exists $clusters{$clusterid};
         $clusters{$clusterid}{$id} = 1;
     }
 }
 close $cluster_fh;

 open my $out_fh, '>', $result or die "Can't open $result: $!\n";
 print {$out_fh} "ClusterID\tNbSeqs\n";
 # Emit clusters in input order so the table is reproducible between runs.
 for my $clid (@cluster_order) {
     my $nbseqs = keys %{ $clusters{$clid} };
     print {$out_fh} "$clid\t$nbseqs\n";
 }
 # Buffered write errors only surface at close, so check it.
 close $out_fh or die "Can't close $result: $!\n";
	# use this to get the number of reads in each cluster
	use strict;
	use Getopt::Long;
	use Bio::SeqIO;

	# Count how many distinct sequences each cluster of a CD-HIT ".clstr" file
	# contains and write the counts as a two-column, tab-separated table.
	#
	# Options:
	#   --clstr FILE   cd-hit generated cluster file to read
	#   --out   FILE   text file to write ("ClusterID<TAB>NbSeqs", one row per cluster)
	#
	# Dies with a usage message when an option is missing, and with the system
	# error when a file cannot be opened or the output cannot be flushed.
	use warnings;

	my ($clstr, $result);
	my $usage = "Usage: $0 --clstr <cd-hit .clstr file> --out <output table>\n";
	GetOptions(
	    'clstr:s' => \$clstr,     # a cd-hit generated cluster file
	    'out:s'   => \$result,    # a text file with the numbers of reads in each cluster
	) or die $usage;

	# ":s" makes the option value optional, so an empty string must be rejected too.
	die $usage
	    unless defined $clstr && length $clstr && defined $result && length $result;

	print "Input cluster file $clstr, output $result\n";

	open my $cluster_fh, '<', $clstr or die "Can't open $clstr: $!\n";

	my %clusters;         # cluster id => { sequence id => 1 }
	my @cluster_order;    # cluster ids in the order their first member was read
	my $clusterid;        # id of the cluster whose members are currently being read

	while (my $line = <$cluster_fh>) {
	    # Strip the line ending (LF or CRLF) so a stray "\r" never ends up in an id.
	    $line =~ s/\r?\n\z//;

	    if ($line =~ /^>(.+)$/) {
	        # Cluster header line, e.g. ">Cluster 0".
	        $clusterid = $1;
	    }
	    elsif ($line =~ /\d+.+>(.+)\.\.\..+$/) {
	        # Member line: "<index> <length>, ><sequence id>... <'*' or 'at NN%'>".
	        # The representative ('*') and the other members are counted alike.
	        my $id = $1;
	        if (!defined $clusterid) {
	            warn "Skipping member '$id' found before any cluster header\n";
	            next;
	        }
	        push @cluster_order, $clusterid unless exists $clusters{$clusterid};
	        $clusters{$clusterid}{$id} = 1;
	    }
	}
	close $cluster_fh;

	open my $out_fh, '>', $result or die "Can't open $result: $!\n";
	print {$out_fh} "ClusterID\tNbSeqs\n";
	# Emit clusters in input order so the table is reproducible between runs.
	for my $clid (@cluster_order) {
	    my $nbseqs = keys %{ $clusters{$clid} };
	    print {$out_fh} "$clid\t$nbseqs\n";
	}
	# Buffered write errors only surface at close, so check it.
	close $out_fh or die "Can't close $result: $!\n";
No results found