Created
November 28, 2012 01:38
-
-
Save sahrendt0/4158493 to your computer and use it in GitHub Desktop.
GEN 220 Week 8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# Script: hw8.pl | |
# Description: Trim adapter, clean reads | |
# Author: Steven Ahrendt | |
# email: [email protected] | |
################################## | |
# Adapter: CTGTAGGCACCATCAAT | |
# trim if it contains a perfect match to first 6 bases | |
################################## | |
use warnings; | |
use strict; | |
use Bio::Seq; | |
use Bio::SeqIO; | |
my $infile = shift; | |
my ($seq_obj,$seq_str,%seqs); | |
my $adapter = "CTGTAGGCACCATCAAT"; | |
my $a_m = 6; #Seq must match first 6 bases of adapter for trimming | |
my $match = substr($adapter,0,$a_m); | |
my $c = 0; | |
my $total = 0; | |
my $clean = 0; | |
open(my $fh => "zcat $infile |") || die $!; | |
while(<$fh>) | |
{ | |
chomp $_; | |
if($_ =~ m/^>/) | |
{ | |
if($c != 0) | |
{ | |
$seq_obj->seq($seq_str); | |
$total++; | |
if ((my $a_pos = adapterFind($seq_obj->seq)) > -1) | |
{ | |
$seq_obj->seq(substr($seq_str,0,$a_pos)); | |
if ($seq_obj->seq !~ m/N+/g) | |
{ | |
if($seq_obj->length >= 18) | |
{ | |
$seqs{$seq_obj->display_id} = $seq_obj; | |
$clean++; | |
#push(@seqs,$seq_obj); | |
} | |
} | |
} | |
} | |
my $header = substr($_,1); | |
$seq_obj = Bio::Seq->new(-display_id => $header); | |
$seq_str = ""; | |
} | |
else | |
{ | |
$seq_str = $_; | |
} | |
$c++; | |
} | |
$seq_obj->seq($seq_str); | |
$total++; | |
if(($seq_obj->length >= 18) && ($seq_obj->seq !~ m/N+/g) && ($seq_obj->seq =~ m/^$match/)) | |
{ | |
$seqs{$seq_obj->display_id} = $seq_obj; | |
$clean++; | |
} | |
#push(@seqs,$seq_obj); | |
close($fh); | |
open(RES,">results"); | |
print RES "$total read(s)\n"; | |
print RES "$clean clean read(s)\n"; | |
#print $seqs[0]->seq,"\n"; | |
open(SIZES,">sizes"); | |
my $clean_reads = Bio::SeqIO->new(-file => ">clean_reads.fa", | |
-format => "fasta"); | |
foreach my $key (keys %seqs) | |
{ | |
print SIZES $seqs{$key}->length,"\n"; | |
$clean_reads->write_seq($seqs{$key}); | |
} | |
close(SIZES); | |
## Step 2: Cluster | |
my %cluster; | |
my $unique_reads = Bio::SeqIO->new(-file => ">unique_reads.fa", | |
-format => "fasta"); | |
my $unique = 0; | |
foreach my $key (keys %seqs) | |
{ | |
$cluster{ $seqs{$key}->seq} = $seqs{$key}; | |
$unique_reads->write_seq($cluster{$seqs{$key}->seq}); | |
$unique++; | |
} | |
print RES "$unique unique read(s)\n"; | |
close(RES); | |
sub adapterFind { | |
my $seq = shift; | |
my $pos = -1; | |
my $len = -1; | |
while(($pos = index($seq,$match,$pos)) > -1) | |
{ | |
$len = $pos; | |
$pos++; | |
} | |
return $len; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# Script: mirbase_comp.pl | |
# Description: GEN220 HW8, compare unique reads to mirbase | |
# Author: Steven Ahrendt | |
# email: [email protected] | |
##################################### | |
use strict; | |
use warnings; | |
use Bio::Seq; | |
use Bio::SeqIO; | |
use List::MoreUtils 'true'; | |
#my $count = true { /pattern/ } @list_of_strings; | |
my $mirfile = $ARGV[0]; | |
my $reads_file = $ARGV[1]; | |
my (%mirnas,%clean_reads); | |
my (@m_seqs, @c_seqs); | |
%mirnas = readFile($mirfile); | |
%clean_reads = readFile($reads_file); | |
foreach my $c_key (keys %clean_reads) | |
{ | |
push(@c_seqs, $clean_reads{$c_key}->seq); | |
} | |
open(OUT,">outfile"); | |
foreach my $keym (keys %mirnas) | |
{ | |
my $mseq = $mirnas{$keym}->seq; | |
$mseq =~ s/U/T/g; | |
next if (!(my $match = true {/$mseq/} @c_seqs)); | |
my $norm = ($match*10000000)/(scalar @c_seqs); # RPTM | |
print "$keym\t$norm\n"; | |
print OUT "$keym\t$norm\n"; | |
} | |
close(OUT); | |
## Subroutine: readFile | |
# Input: Name of a fasta file (**can be gzipped**) | |
# Returns: Hash of Bio::Seq objects with display_id as hash key | |
sub readFile { | |
my (%seq_hash,$seq_str,$seq_obj,$header); | |
my $infile = shift; | |
#print $infile,"\n"; | |
if((split(/\./,$infile))[-1] eq "gz") | |
{ | |
open(IN => "zcat $infile |") || die $!; | |
my $c=0; | |
foreach my $line (<IN>) | |
{ | |
chomp $line; | |
#print $line,"\n"; | |
if($line =~ m/^>/) | |
{ | |
if($c != 0) | |
{ | |
$seq_obj->seq($seq_str); | |
$seq_hash{$seq_obj->display_id} = $seq_obj; | |
} | |
$header = substr($line,1); | |
$seq_obj = Bio::Seq->new(-display_id => $header); | |
$seq_str = ""; | |
} | |
else | |
{ | |
$seq_str = $line; #join("",$seq_str,$_); | |
} | |
$c++; | |
} | |
$seq_obj->seq($seq_str); | |
$seq_hash{$seq_obj->display_id} = $seq_obj; | |
close(IN); | |
} | |
else | |
{ | |
my $seqIO_obj = Bio::SeqIO->new(-file => "<$infile", | |
-format => "fasta"); | |
while($seq_obj = $seqIO_obj->next_seq) | |
{ | |
$seq_hash{$seq_obj->display_id} = $seq_obj; | |
} | |
} | |
return %seq_hash; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
# Script: seqlen.pl | |
# Description: Extracts sequence lengths from a fasta file | |
# Author: Steven Ahrendt | |
# email: [email protected] | |
# Date: 6.17.11 | |
##################################################### | |
# Usage: seqlen.pl [-v] fastafile | |
##################################################### | |
use strict; | |
use warnings; | |
use Bio::SeqIO; | |
use List::Util qw[min max]; | |
my $screen_out = 0; | |
# Handle command-line options | |
use Getopt::Std; | |
my %opts; | |
getopts('v', \%opts); | |
if (exists $opts{'v'}) | |
{ | |
$screen_out = 1; | |
} | |
my (@tmp,@len); | |
my @lengths; | |
my $infile = $ARGV[0]; | |
print `perl -pi -e 's/\r/\n/g' $infile`; | |
my $numseqs = 0; | |
my $fastafile = Bio::SeqIO->new(-file=>"<$infile", -format=>"fasta"); | |
my $csvfile = join(".",infilename($infile),"csv"); | |
open(OUT,">$csvfile"); | |
while(my $seq = $fastafile->next_seq) | |
{ | |
push(@lengths,$seq->length); | |
print OUT $seq->display_id,",",$seq->length,"\n"; | |
if($screen_out) | |
{ | |
print $seq->display_id,",",$seq->length,"\n"; | |
} | |
} | |
close(OUT); | |
$numseqs = @lengths; | |
print "# Seqs:\t$numseqs\n"; | |
print "Min:\t",min(@lengths),"\n"; | |
print "Max:\t",max(@lengths),"\n"; | |
printf("Mean:\t%.2f\n",mean(@lengths)); | |
if($numseqs == 1) | |
{ | |
printf("Stdv:\tNA\n"); | |
} | |
else | |
{ | |
printf("Stdv:\t%.2f\n",stddev(@lengths)); | |
} | |
print "-----\n"; | |
print "Lengths are in \"$csvfile\"\n"; | |
print "-----\n"; | |
sub stddev { | |
my @a = @_; | |
my $n = @a; | |
my $c = (sum(@a) ** 2)/$n; | |
my $num = sumsqr(@a) - $c; | |
#print "\n"; | |
#print sumsqr(@a); | |
#print " - "; | |
#print "(",sum(@a),"^2)/($n)\n"; | |
#print "-----------------------\n"; | |
#print $n,"-1\n"; | |
return ($num/($n-1)) ** 0.5; | |
} | |
sub mean { | |
my @a = @_; | |
my $n = @a; | |
return (sum(@a)/$n); | |
} | |
sub sumsqr { | |
my @a = @_; | |
my $ret = 0; | |
foreach my $i (@a) | |
{ | |
$ret += ($i**2); | |
} | |
return $ret; | |
} | |
sub sum { | |
my @a = @_; | |
my $ret = 0; | |
foreach my $i (@a) | |
{ | |
$ret += $i; | |
} | |
return $ret; | |
} | |
sub round { | |
my($number) = shift; | |
return int($number + .5); | |
} | |
sub infilename { | |
my $file = shift; | |
return substr($file,0,index($file,".")); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment