|
#!/usr/bin/perl |
|
use warnings; |
|
use strict; |
|
use Data::Dumper; |
|
use Getopt::Std; |
|
use Scalar::Util qw(looks_like_number); |
|
|
|
&main(); |
|
|
|
sub main { |
|
#use List::Util qw/max/; my $in; |
|
|
|
my $use="$0 [-q 0 ] -a LINKER reads_1.in.fq.gz reads.out.fq.gz [reads_2.in.fq.gz ] . |
|
Splits reads by LINKER Sequence. Due to PE difficulties writes to single file. PE is experimental.\n"; |
|
#open/check some required files |
|
|
|
my $opts; |
|
getopts('q:a:', \%{$opts}); |
|
die $use."\n" if(not(scalar(@ARGV) == 2 || (scalar(@ARGV) == 3))); |
|
|
|
die "[FATAL] if '-q INT' is specified use integer for INT".$use if(defined($opts -> {'q'}) && not(looks_like_number($opts -> {'q'}))); |
|
|
|
die "[FATAL] linkersequence does not match [ATCGNatcgn]" if(not($opts -> {'a'} =~ m/^[ATCGNatcgn]+$/)); |
|
#se |
|
|
|
#open handles fastq files |
|
|
|
open(my $fastq1Handle,"-|",'gzip -dc '.$ARGV[0]) |
|
or die "[FATAL] Cannot read open fq1 file ".$ARGV[0]; |
|
|
|
defined($ARGV[1]) or die "Outfile reads_1.out.fq.gz not specified!"; |
|
open(my $fastq1OutHandle,"|-",'gzip -c > '.$ARGV[1]) |
|
or die "[FATAL] Cannot write fq1 file ".$ARGV[1]; |
|
|
|
my $fastq2Handle; |
|
my $fastq2OutHandle; |
|
|
|
if(defined($ARGV[2]) && -e $ARGV[2] && defined($ARGV[3])){ |
|
|
|
open($fastq2Handle,"-|",'gzip -dc '.$ARGV[2]) |
|
or die "Cannot read open fq2 file ".$ARGV[2]; |
|
defined($ARGV[3]) or die "Outfile reads_2.out.fq.gz not specified!"; |
|
} |
|
|
|
|
|
|
|
warn "## ". localtime(time()). " ## INFO ## Starting iteration.\n"; |
|
my $timelast = time(); |
|
|
|
my $stats; |
|
|
|
|
|
#if reading record fails from both fq files then end |
|
|
|
while( ( (not(defined($ARGV[2]))) || (my $fq2 = ReadFastq(\$fastq2Handle)) ) && (my $fq1 = ReadFastq(\$fastq1Handle)) ){ |
|
#progress |
|
if($. % 1000 == 0){ |
|
my $time = time(); |
|
if($time % 20 == 0 && $time != $timelast){ |
|
$timelast = $time; |
|
warn "## ". localtime($time). " ## INFO ## Currently at line ". $. ."\n"; |
|
} |
|
} |
|
|
|
$stats -> {'recordcount'}++; |
|
if(not(defined($opts -> {'q'})) || TestQual($opts -> {'q'},$fq1)){ |
|
$stats -> {'passcount'}++; |
|
SplitFqByLinker(\$fastq1OutHandle, $opts -> {'a'}, $fq1); |
|
|
|
if(defined($ARGV[3]) && -e $ARGV[3]){ |
|
SplitFqByLinker(\$fastq1OutHandle, $opts -> {'a'}, $fq2); |
|
} |
|
} |
|
} |
|
if( ( (defined($ARGV[2]) && -e $ARGV[2] ) && (my $fq2 = ReadFastq(\$fastq2Handle)) ) || (my $fq1 = ReadFastq(\$fastq1Handle)) ){ |
|
unlink($ARGV[1])if(-e $ARGV[1]); |
|
unlink($ARGV[3])if(-e $ARGV[3]); |
|
die "Some fastq handles are still readable:This program produced invalid output. Check your fastq files!"; |
|
} |
|
warn "## ". localtime(time()). " ## INFO ## Done with " . $stats -> {'recordcount'} . " records processed and " .$stats -> {'passcount'} . " records passed at line ". $. ."\n"; |
|
} |
|
sub SplitFqByLinker { |
|
my $handle = shift @_; |
|
my $linker = shift @_; |
|
my $fq = shift @_; |
|
#my $fq2; |
|
#$fq2 = shift @_ if(scalar(@_)>0); |
|
|
|
#index print in header just to get good data |
|
my $fqLength = length($fq -> [1]); |
|
my $fqBase; |
|
my $fqInitLength; |
|
if ($fq -> [0] =~ m/(.*)_(\d+)$/){ |
|
$fqBase=$1; |
|
$fqInitLength=$2; |
|
}else{ |
|
$fqInitLength= $fqLength; |
|
$fqBase = $fq -> [0]; |
|
$fq -> [0] = $fq -> [0] . '_' . $fqLength; |
|
} |
|
|
|
my $index = index($fq ->[1],$linker); |
|
if( $index==-1){ |
|
WriteFastq($handle , $fq) if($fqLength > 0); |
|
}elsif($index>-1){ |
|
my $fqs;#fqsubset |
|
#$fqs ->[0] = $fq ->[0]; |
|
#$fqs ->[0] = $fqBase.'_'. ($fqInitLenght - )+ $index; |
|
$fqs->[1]=substr($fq ->[1],0,$index); |
|
$fqs->[2]=substr($fq ->[2],0,$index); |
|
$fq->[1]=substr($fq ->[1],$index + length($linker)); |
|
$fq->[2]=substr($fq ->[2],$index + length($linker)); |
|
|
|
$fqs ->[0] = $fqBase.'_'. (($fqInitLength - $fqLength) + $index); |
|
|
|
WriteFastq($handle , $fqs) if (length($fqs->[1])>0); |
|
SplitFqByLinker($handle,$linker,$fq); |
|
} |
|
} |
|
|
|
sub ReadFastq { |
|
my $fqHandle = ${shift(@_)}; |
|
return if(eof($fqHandle)); |
|
my $seqHeader = <$fqHandle>; |
|
chomp $seqHeader; |
|
|
|
$seqHeader = substr($seqHeader,1); |
|
my $seq = <$fqHandle>; |
|
|
|
chomp $seq; |
|
my $qualHeader = <$fqHandle>; |
|
|
|
my $qual = <$fqHandle>; |
|
chomp $qual; |
|
|
|
my $fastq; |
|
$fastq->[1]=$seq; |
|
$fastq->[0]=$seqHeader; |
|
$fastq->[2]=$qual; |
|
|
|
#warn $fastq->[0]; |
|
|
|
return $fastq; |
|
} |
|
|
|
sub WriteFastq { |
|
my $fqHandle = ${shift(@_)}; |
|
my $fq = shift @_; |
|
|
|
print $fqHandle "\@".$fq->[0]."\n"; |
|
print $fqHandle $fq->[1]."\n"; |
|
print $fqHandle "\+\n"; |
|
print $fqHandle $fq->[2]."\n"; |
|
|
|
#warn "\@".$fq->[0]."\n"; |
|
#print $fqHandle $fq->[1]."\n"; |
|
#print $fqHandle "\+\n"; |
|
#print $fqHandle $fq->[2]."\n"; |
|
} |
|
|
|
sub TestQual { |
|
#work in progress |
|
my $qual=shift @_; |
|
my $fq1=shift @_; |
|
my $fq2; |
|
$fq2=shift @_ if(scalar(@_)); |
|
die '[FATAL] Work in progress. Raise this as an issue, fix it yourself as commit or use alternative software like trimGalore/cutadapt.' if ($qual); |
|
} |