standage · December 17, 2015 09:19
diff --git a/sample-gc.pl b/sample-gc.pl
 #!/usr/bin/env perl

 # Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com>
 #
 # Permission to use, copy, modify, and/or distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
 # copyright notice and this permission notice appear in all copies.
 #
 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

 use strict;
 use Bio::SeqIO;
 use File::Basename;
 use Getopt::Long;

 sub print_usage
 {
  my $OUT = shift(@_);
  print $OUT "
 Sample random subsequences from a set of sequences, calculate their GC content
 Usage: perl ". basename($0) ." <options> seqs1.fa seqs2.fa ...
  Options:
    -d|--distribution: FILE   plot the normalized distribution of subsequence
                              positions in PNG format to the given file; this
                              can be used to verify there is no sampling bias
    -h|--help                 print this help message and exit
    -l|--lengthfilter: INT    ignore sequences whose length does not satisfy
                              this criterion; default is 1000000; note that
                              ambiguous nucleotides (those that are not in the
                              alphabet {ATWGCS}) will not count toward the
                              sequence length
    -n|--numsamples: INT      number of subsequences to randomly sample from
                              each sequence; default is 5
    -s|--subseqlength: INT    randomly sample subsequences from each sequence of
                              the given length; default is 20000
    -x|--maxambig: INT        maximum number of ambiguous nucleotides allowed in
                              each subsequence; default is 200
 ";
 }

 my $distfile     = "";
 my $DIST;
 my $lengthfilter = 1000000;
 my $numsamples   = 5;
 my $subseqlength = 20000;
 my $maxambig     = 200;
 GetOptions
 (
  "d|distribution=s" => \$distfile,
  "h|help"           => sub{ print_usage(\*STDOUT); exit(0); },
  "m|lengthfilter=i" => \$lengthfilter,
  "n|numsamples=i"   => \$numsamples,
  "s|subseqlength=i" => \$subseqlength,
  "x|maxambig=i"     => \$maxambig,
 );

 if($distfile ne "")
 {
  if($distfile !~ m/\.png$/)
  {
    printf(STDERR "'%s' does not end with .png, adding extension\n", $distfile);
    $distfile .= ".png";
  }
  open($DIST, ">", "$distfile.csv") or die("error opening file '$distfile.csv' $!");
 }

 printf("%s\n", join(",", qw(Genome Sequence GCcontent Location)));
 foreach my $file(@ARGV)
 {
  my($label, $dir, $ext) = fileparse($file, qr/\.[^.]*/);
  my $start = time();
  printf(STDERR "Sequences from %s\n", $label);
  my $nseqs = 0;
  my $nsubseqs = 0;
  
  my $seqloader = Bio::SeqIO->new("-file" => $file, "-format" => "Fasta");
  while(my $seq = $seqloader->next_seq())
  {
    my $sequence = $seq->seq();
    
    my $gc = $sequence =~ tr/GCS//; # S is IUPAC ambiguity symbol for G or C
    my $at = $sequence =~ tr/ATW//; # W is IUPAC ambiguity symbol for A or T
    if($gc + $at >= $lengthfilter)
    {
      $nseqs += 1;
      my $numsubseqs = 0;
      while($numsubseqs < $numsamples)
      {
        my $pos = int rand($seq->length() - $subseqlength + 1);
        my $subseq = substr($sequence, $pos, $subseqlength);
        my $numambig =()= $subseq =~ m/[^ACGTSW]/g;
        next if($subseq =~ /\QX/ or $numambig > $maxambig);
        
        my $subgc = $subseq =~ tr/GCS//;
        my $subat = $subseq =~ tr/ATW//;
        my $gccontent = $subgc / ($subgc + $subat);
        my $normpos = $pos / ($seq->length() - $subseqlength + 1);
        my $loc = sprintf("%s-%s[%lu-%lu]", $label, $seq->id, $pos,
                          $pos + $subseqlength - 1);
        printf("%s,%s-%s,%.4f,%s\n", $label, $label, $seq->id, $gccontent, $loc);
        printf($DIST "%s,%.6f\n", $loc, $normpos) if($distfile ne "");
        
        substr($sequence, $pos, $subseqlength, "X" x $subseqlength);
        $numsubseqs += 1;
        $nsubseqs += 1;
      }
    }
  }
  $seqloader->close();
  my $elapsed = time() - $start;
  printf(STDERR "    %d sequences met length criterion\n", $nseqs);
  printf(STDERR "    sampled a total of %d subsequences\n", $nsubseqs);
  printf(STDERR "    elapsed seconds: %d\n", $elapsed);
 }

 if($distfile ne "")
 {
  close($DIST);
  my $rscript = "data <- read.csv(\"$distfile.csv\", header=FALSE); ".
                "png(filename=\"$distfile\"); ".
                "hist(data\$V2, breaks=100, col=\"#ffeeee\", border=\"red\", ".
                     "main=\"\", xlab=\"Position in sequence\"); ".
                "device <- dev.off()";
  system("Rscript --vanilla -e '$rscript' 1>&2 && rm $distfile.csv");
 }
	#!/usr/bin/env perl

	# Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com>
	#
	# Permission to use, copy, modify, and/or distribute this software for any
	# purpose with or without fee is hereby granted, provided that the above
	# copyright notice and this permission notice appear in all copies.
	#
	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
	# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
	# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
	# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

	use strict;
	use Bio::SeqIO;
	use File::Basename;
	use Getopt::Long;

	sub print_usage
	{
	my $OUT = shift(@_);
	print $OUT "
	Sample random subsequences from a set of sequences, calculate their GC content
	Usage: perl ". basename($0) ." <options> seqs1.fa seqs2.fa ...
	Options:
	-d\|--distribution: FILE plot the normalized distribution of subsequence
	positions in PNG format to the given file; this
	can be used to verify there is no sampling bias
	-h\|--help print this help message and exit
	-l\|--lengthfilter: INT ignore sequences whose length does not satisfy
	this criterion; default is 1000000; note that
	ambiguous nucleotides (those that are not in the
	alphabet {ATWGCS}) will not count toward the
	sequence length
	-n\|--numsamples: INT number of subsequences to randomly sample from
	each sequence; default is 5
	-s\|--subseqlength: INT randomly sample subsequences from each sequence of
	the given length; default is 20000
	-x\|--maxambig: INT maximum number of ambiguous nucleotides allowed in
	each subsequence; default is 200
	";
	}

	my $distfile = "";
	my $DIST;
	my $lengthfilter = 1000000;
	my $numsamples = 5;
	my $subseqlength = 20000;
	my $maxambig = 200;
	GetOptions
	(
	"d\|distribution=s" => \$distfile,
	"h\|help" => sub{ print_usage(\*STDOUT); exit(0); },
	"m\|lengthfilter=i" => \$lengthfilter,
	"n\|numsamples=i" => \$numsamples,
	"s\|subseqlength=i" => \$subseqlength,
	"x\|maxambig=i" => \$maxambig,
	);

	if($distfile ne "")
	{
	if($distfile !~ m/\.png$/)
	{
	printf(STDERR "'%s' does not end with .png, adding extension\n", $distfile);
	$distfile .= ".png";
	}
	open($DIST, ">", "$distfile.csv") or die("error opening file '$distfile.csv' $!");
	}

	printf("%s\n", join(",", qw(Genome Sequence GCcontent Location)));
	foreach my $file(@ARGV)
	{
	my($label, $dir, $ext) = fileparse($file, qr/\.[^.]*/);
	my $start = time();
	printf(STDERR "Sequences from %s\n", $label);
	my $nseqs = 0;
	my $nsubseqs = 0;

	my $seqloader = Bio::SeqIO->new("-file" => $file, "-format" => "Fasta");
	while(my $seq = $seqloader->next_seq())
	{
	my $sequence = $seq->seq();

	my $gc = $sequence =~ tr/GCS//; # S is IUPAC ambiguity symbol for G or C
	my $at = $sequence =~ tr/ATW//; # W is IUPAC ambiguity symbol for A or T
	if($gc + $at >= $lengthfilter)
	{
	$nseqs += 1;
	my $numsubseqs = 0;
	while($numsubseqs < $numsamples)
	{
	my $pos = int rand($seq->length() - $subseqlength + 1);
	my $subseq = substr($sequence, $pos, $subseqlength);
	my $numambig =()= $subseq =~ m/[^ACGTSW]/g;
	next if($subseq =~ /\QX/ or $numambig > $maxambig);

	my $subgc = $subseq =~ tr/GCS//;
	my $subat = $subseq =~ tr/ATW//;
	my $gccontent = $subgc / ($subgc + $subat);
	my $normpos = $pos / ($seq->length() - $subseqlength + 1);
	my $loc = sprintf("%s-%s[%lu-%lu]", $label, $seq->id, $pos,
	$pos + $subseqlength - 1);
	printf("%s,%s-%s,%.4f,%s\n", $label, $label, $seq->id, $gccontent, $loc);
	printf($DIST "%s,%.6f\n", $loc, $normpos) if($distfile ne "");

	substr($sequence, $pos, $subseqlength, "X" x $subseqlength);
	$numsubseqs += 1;
	$nsubseqs += 1;
	}
	}
	}
	$seqloader->close();
	my $elapsed = time() - $start;
	printf(STDERR " %d sequences met length criterion\n", $nseqs);
	printf(STDERR " sampled a total of %d subsequences\n", $nsubseqs);
	printf(STDERR " elapsed seconds: %d\n", $elapsed);
	}

	if($distfile ne "")
	{
	close($DIST);
	my $rscript = "data <- read.csv(\"$distfile.csv\", header=FALSE); ".
	"png(filename=\"$distfile\"); ".
	"hist(data\$V2, breaks=100, col=\"#ffeeee\", border=\"red\", ".
	"main=\"\", xlab=\"Position in sequence\"); ".
	"device <- dev.off()";
	system("Rscript --vanilla -e '$rscript' 1>&2 && rm $distfile.csv");
	}
No results found