Last active
December 17, 2015 09:19
-
-
Save standage/5586245 to your computer and use it in GitHub Desktop.
Given a list of Fasta files containing genomic sequences, sample random subsequences and calculate their GC content.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| # Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com> | |
| # | |
| # Permission to use, copy, modify, and/or distribute this software for any | |
| # purpose with or without fee is hereby granted, provided that the above | |
| # copyright notice and this permission notice appear in all copies. | |
| # | |
| # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
| # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
| # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
| # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
| # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
| # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
| # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
| use strict; | |
| use Bio::SeqIO; | |
| use File::Basename; | |
| use Getopt::Long; | |
| sub print_usage | |
| { | |
| my $OUT = shift(@_); | |
| print $OUT " | |
| Sample random subsequences from a set of sequences, calculate their GC content | |
| Usage: perl ". basename($0) ." <options> seqs1.fa seqs2.fa ... | |
| Options: | |
| -d|--distribution: FILE plot the normalized distribution of subsequence | |
| positions in PNG format to the given file; this | |
| can be used to verify there is no sampling bias | |
| -h|--help print this help message and exit | |
| -l|--lengthfilter: INT ignore sequences whose length does not satisfy | |
| this criterion; default is 1000000; note that | |
| ambiguous nucleotides (those that are not in the | |
| alphabet {ATWGCS}) will not count toward the | |
| sequence length | |
| -n|--numsamples: INT number of subsequences to randomly sample from | |
| each sequence; default is 5 | |
| -s|--subseqlength: INT randomly sample subsequences from each sequence of | |
| the given length; default is 20000 | |
| -x|--maxambig: INT maximum number of ambiguous nucleotides allowed in | |
| each subsequence; default is 200 | |
| "; | |
| } | |
| my $distfile = ""; | |
| my $DIST; | |
| my $lengthfilter = 1000000; | |
| my $numsamples = 5; | |
| my $subseqlength = 20000; | |
| my $maxambig = 200; | |
| GetOptions | |
| ( | |
| "d|distribution=s" => \$distfile, | |
| "h|help" => sub{ print_usage(\*STDOUT); exit(0); }, | |
| "m|lengthfilter=i" => \$lengthfilter, | |
| "n|numsamples=i" => \$numsamples, | |
| "s|subseqlength=i" => \$subseqlength, | |
| "x|maxambig=i" => \$maxambig, | |
| ); | |
| if($distfile ne "") | |
| { | |
| if($distfile !~ m/\.png$/) | |
| { | |
| printf(STDERR "'%s' does not end with .png, adding extension\n", $distfile); | |
| $distfile .= ".png"; | |
| } | |
| open($DIST, ">", "$distfile.csv") or die("error opening file '$distfile.csv' $!"); | |
| } | |
| printf("%s\n", join(",", qw(Genome Sequence GCcontent Location))); | |
| foreach my $file(@ARGV) | |
| { | |
| my($label, $dir, $ext) = fileparse($file, qr/\.[^.]*/); | |
| my $start = time(); | |
| printf(STDERR "Sequences from %s\n", $label); | |
| my $nseqs = 0; | |
| my $nsubseqs = 0; | |
| my $seqloader = Bio::SeqIO->new("-file" => $file, "-format" => "Fasta"); | |
| while(my $seq = $seqloader->next_seq()) | |
| { | |
| my $sequence = $seq->seq(); | |
| my $gc = $sequence =~ tr/GCS//; # S is IUPAC ambiguity symbol for G or C | |
| my $at = $sequence =~ tr/ATW//; # W is IUPAC ambiguity symbol for A or T | |
| if($gc + $at >= $lengthfilter) | |
| { | |
| $nseqs += 1; | |
| my $numsubseqs = 0; | |
| while($numsubseqs < $numsamples) | |
| { | |
| my $pos = int rand($seq->length() - $subseqlength + 1); | |
| my $subseq = substr($sequence, $pos, $subseqlength); | |
| my $numambig =()= $subseq =~ m/[^ACGTSW]/g; | |
| next if($subseq =~ /\QX/ or $numambig > $maxambig); | |
| my $subgc = $subseq =~ tr/GCS//; | |
| my $subat = $subseq =~ tr/ATW//; | |
| my $gccontent = $subgc / ($subgc + $subat); | |
| my $normpos = $pos / ($seq->length() - $subseqlength + 1); | |
| my $loc = sprintf("%s-%s[%lu-%lu]", $label, $seq->id, $pos, | |
| $pos + $subseqlength - 1); | |
| printf("%s,%s-%s,%.4f,%s\n", $label, $label, $seq->id, $gccontent, $loc); | |
| printf($DIST "%s,%.6f\n", $loc, $normpos) if($distfile ne ""); | |
| substr($sequence, $pos, $subseqlength, "X" x $subseqlength); | |
| $numsubseqs += 1; | |
| $nsubseqs += 1; | |
| } | |
| } | |
| } | |
| $seqloader->close(); | |
| my $elapsed = time() - $start; | |
| printf(STDERR " %d sequences met length criterion\n", $nseqs); | |
| printf(STDERR " sampled a total of %d subsequences\n", $nsubseqs); | |
| printf(STDERR " elapsed seconds: %d\n", $elapsed); | |
| } | |
| if($distfile ne "") | |
| { | |
| close($DIST); | |
| my $rscript = "data <- read.csv(\"$distfile.csv\", header=FALSE); ". | |
| "png(filename=\"$distfile\"); ". | |
| "hist(data\$V2, breaks=100, col=\"#ffeeee\", border=\"red\", ". | |
| "main=\"\", xlab=\"Position in sequence\"); ". | |
| "device <- dev.off()"; | |
| system("Rscript --vanilla -e '$rscript' 1>&2 && rm $distfile.csv"); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment