Skip to content

Instantly share code, notes, and snippets.

@standage
Last active December 17, 2015 09:19
Show Gist options
  • Select an option

  • Save standage/5586245 to your computer and use it in GitHub Desktop.

Select an option

Save standage/5586245 to your computer and use it in GitHub Desktop.
Given a list of Fasta files containing genomic sequences, sample random subsequences and calculate their GC content.
#!/usr/bin/env perl
# Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use strict;
use Bio::SeqIO;
use File::Basename;
use Getopt::Long;
sub print_usage
{
my $OUT = shift(@_);
print $OUT "
Sample random subsequences from a set of sequences, calculate their GC content
Usage: perl ". basename($0) ." <options> seqs1.fa seqs2.fa ...
Options:
-d|--distribution: FILE plot the normalized distribution of subsequence
positions in PNG format to the given file; this
can be used to verify there is no sampling bias
-h|--help print this help message and exit
-l|--lengthfilter: INT ignore sequences whose length does not satisfy
this criterion; default is 1000000; note that
ambiguous nucleotides (those that are not in the
alphabet {ATWGCS}) will not count toward the
sequence length
-n|--numsamples: INT number of subsequences to randomly sample from
each sequence; default is 5
-s|--subseqlength: INT randomly sample subsequences from each sequence of
the given length; default is 20000
-x|--maxambig: INT maximum number of ambiguous nucleotides allowed in
each subsequence; default is 200
";
}
my $distfile = "";
my $DIST;
my $lengthfilter = 1000000;
my $numsamples = 5;
my $subseqlength = 20000;
my $maxambig = 200;
GetOptions
(
"d|distribution=s" => \$distfile,
"h|help" => sub{ print_usage(\*STDOUT); exit(0); },
"m|lengthfilter=i" => \$lengthfilter,
"n|numsamples=i" => \$numsamples,
"s|subseqlength=i" => \$subseqlength,
"x|maxambig=i" => \$maxambig,
);
if($distfile ne "")
{
if($distfile !~ m/\.png$/)
{
printf(STDERR "'%s' does not end with .png, adding extension\n", $distfile);
$distfile .= ".png";
}
open($DIST, ">", "$distfile.csv") or die("error opening file '$distfile.csv' $!");
}
printf("%s\n", join(",", qw(Genome Sequence GCcontent Location)));
foreach my $file(@ARGV)
{
my($label, $dir, $ext) = fileparse($file, qr/\.[^.]*/);
my $start = time();
printf(STDERR "Sequences from %s\n", $label);
my $nseqs = 0;
my $nsubseqs = 0;
my $seqloader = Bio::SeqIO->new("-file" => $file, "-format" => "Fasta");
while(my $seq = $seqloader->next_seq())
{
my $sequence = $seq->seq();
my $gc = $sequence =~ tr/GCS//; # S is IUPAC ambiguity symbol for G or C
my $at = $sequence =~ tr/ATW//; # W is IUPAC ambiguity symbol for A or T
if($gc + $at >= $lengthfilter)
{
$nseqs += 1;
my $numsubseqs = 0;
while($numsubseqs < $numsamples)
{
my $pos = int rand($seq->length() - $subseqlength + 1);
my $subseq = substr($sequence, $pos, $subseqlength);
my $numambig =()= $subseq =~ m/[^ACGTSW]/g;
next if($subseq =~ /\QX/ or $numambig > $maxambig);
my $subgc = $subseq =~ tr/GCS//;
my $subat = $subseq =~ tr/ATW//;
my $gccontent = $subgc / ($subgc + $subat);
my $normpos = $pos / ($seq->length() - $subseqlength + 1);
my $loc = sprintf("%s-%s[%lu-%lu]", $label, $seq->id, $pos,
$pos + $subseqlength - 1);
printf("%s,%s-%s,%.4f,%s\n", $label, $label, $seq->id, $gccontent, $loc);
printf($DIST "%s,%.6f\n", $loc, $normpos) if($distfile ne "");
substr($sequence, $pos, $subseqlength, "X" x $subseqlength);
$numsubseqs += 1;
$nsubseqs += 1;
}
}
}
$seqloader->close();
my $elapsed = time() - $start;
printf(STDERR " %d sequences met length criterion\n", $nseqs);
printf(STDERR " sampled a total of %d subsequences\n", $nsubseqs);
printf(STDERR " elapsed seconds: %d\n", $elapsed);
}
if($distfile ne "")
{
close($DIST);
my $rscript = "data <- read.csv(\"$distfile.csv\", header=FALSE); ".
"png(filename=\"$distfile\"); ".
"hist(data\$V2, breaks=100, col=\"#ffeeee\", border=\"red\", ".
"main=\"\", xlab=\"Position in sequence\"); ".
"device <- dev.off()";
system("Rscript --vanilla -e '$rscript' 1>&2 && rm $distfile.csv");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment