Skip to content

Instantly share code, notes, and snippets.

@mmterpstra
Last active August 30, 2017 06:57
Show Gist options
  • Save mmterpstra/2417200a96f841862c82220893490202 to your computer and use it in GitHub Desktop.
Save mmterpstra/2417200a96f841862c82220893490202 to your computer and use it in GitHub Desktop.
Split fastq by sequence. if you ligated different inserts to eachother them this script can split the reads into separate reads. tags: TLA DEEPSAGE SAGE HTS NGS

###splitfqbyCATG.pl Splits reads by CA-TG Sequence. Due to PE difficulties writes to single file.

#Install needs gzip and perl as prerequisites. On ubuntu installation is as easy as sudo apt install gzip perl && wget https://gist.githubusercontent.com/mmterpstra/2417200a96f841862c82220893490202/raw/b4172b5f907c46eb862c5387abca168fd6583388/splitfqbyCATG.pl

#Use perl splitfqbyCATG.pl [-q 0 ] reads_1.in.fq.gz reads.out.fq.gz [reads_2.in.fq.gz ]

###Example 1

Use on single end data

perl splitfqbyCATG.pl  R1_001.fastq.gz fastq.out.fq.gz

###Example 2

Use on paired end data

perl splitfqbyCATG.pl  R1_001.fastq.gz fastq.out.fq.gz R2_001.fastq.gz

##Notes

###Preprocessing

For paired end data I would advise using trimgalore + eautils fastqjoin. This because of duplicated sequences in overlapping parts of the paired end reads:

#before 
##pair1
>AAAAAA
     AAAAAA<
##pair2
      >AAAAAAAAAAadapter
adapterAAAAAAAAAA<

#after
##pair1
>AAAAAAAAAA
##pair2
>AAAAAAAAAA

###splitfqbyseq sidenote

splitfqbyCATG.pl = tested: splits by TLA sequence and using the correct assignment of the ligation overhangs.

splitfqbyseq.pl = untested: rework of original idea of splitting a fastq file by a certain sequence. Test for yourself and comment if you would like.

###debug

Statements for helping with debugging / better tracking of the reads:

#Debug:
gzip -dc in.fastq.gz| perl -wpe 's/ /\-/g'| gzip -c | perl splitfqbyCATG.pl  - /dev/stdout | gzip -dc | head -n 500000 > /dev/null
gzip -dc in.fastq.gz| perl -wpe 's/ /\-/g'| gzip -c | perl splitfqbyCATG.pl  - /dev/stdout | gzip -dc | head -n 40
#!/usr/bin/perl
use warnings;
use strict;
use Data::Dumper;
use Getopt::Std;
use Scalar::Util qw(looks_like_number);
&main();
sub main {
#use List::Util qw/max/; my $in;
my $use="$0 [-q 0 ] reads_1.in.fq.gz reads.out.fq.gz [reads_2.in.fq.gz ] .
Splits reads by CA-TG Sequence. Due to PE difficulties writes to single file. PE is experimental.\n";
#open/check some required files
my $opts;
getopts('q:', \%{$opts});
die $use."\n" if(not(scalar(@ARGV) == 2 || (scalar(@ARGV) == 3)));
die "[FATAL] if '-q INT' is specified use integer for INT".$use if(defined($opts -> {'q'}) && not(looks_like_number($opts -> {'q'})));
$opts -> {'a'} = 'CATG';
die "[FATAL] linkersequence does not match [ATCGNatcgn]" if(not($opts -> {'a'} =~ m/^[ATCGNatcgn]+$/));
#se
#open handles fastq files
open(my $fastq1Handle,"-|",'gzip -dc '.$ARGV[0])
or die "[FATAL] Cannot read open fq1 file ".$ARGV[0];
defined($ARGV[1]) or die "Outfile reads_1.out.fq.gz not specified!";
open(my $fastq1OutHandle,"|-",'gzip -c > '.$ARGV[1])
or die "[FATAL] Cannot write fq1 file ".$ARGV[1];
my $fastq2Handle;
my $fastq2OutHandle;
if(defined($ARGV[2]) && -e $ARGV[2] && defined($ARGV[3])){
open($fastq2Handle,"-|",'gzip -dc '.$ARGV[2])
or die "Cannot read open fq2 file ".$ARGV[2];
defined($ARGV[3]) or die "Outfile reads_2.out.fq.gz not specified!";
}
warn "## ". localtime(time()). " ## INFO ## Starting iteration.\n";
my $timelast = time();
my $stats;
#if reading record fails from both fq files then end
while( ( (not(defined($ARGV[2]))) || (my $fq2 = ReadFastq(\$fastq2Handle)) ) && (my $fq1 = ReadFastq(\$fastq1Handle)) ){
#progress
if($. % 1000 == 0){
my $time = time();
if($time % 20 == 0 && $time != $timelast){
$timelast = $time;
warn "## ". localtime($time). " ## INFO ## Currently at line ". $. ."\n";
}
}
$stats -> {'recordcount'}++;
if(not(defined($opts -> {'q'})) || TestQual($opts -> {'q'},$fq1)){
$stats -> {'passcount'}++;
SplitFqByLinker(\$fastq1OutHandle, $opts -> {'a'}, $fq1);
if(defined($ARGV[3]) && -e $ARGV[3]){
SplitFqByLinker(\$fastq1OutHandle, $opts -> {'a'}, $fq2);
}
}
}
if( ( (defined($ARGV[2]) && -e $ARGV[2] ) && (my $fq2 = ReadFastq(\$fastq2Handle)) ) || (my $fq1 = ReadFastq(\$fastq1Handle)) ){
unlink($ARGV[1])if(-e $ARGV[1]);
unlink($ARGV[3])if(-e $ARGV[3]);
die "Some fastq handles are still readable:This program produced invalid output. Check your fastq files!";
}
warn "## ". localtime(time()). " ## INFO ## Done with " . $stats -> {'recordcount'} . " records processed and " .$stats -> {'passcount'} . " records passed at line ". $. ."\n";
}
sub SplitFqByLinker {
my $handle = shift @_;
my $linker = shift @_;
my $fq = shift @_;
#my $fq2;
#$fq2 = shift @_ if(scalar(@_)>0);
#index print in header just to get good data
my $fqLength = length($fq -> [1]);
my $fqBase;
my $fqInitLength;
if ($fq -> [0] =~ m/(.*)_(\d+)$/){
$fqBase=$1;
$fqInitLength=$2;
}else{
$fqInitLength= $fqLength;
$fqBase = $fq -> [0];
$fq -> [0] = $fq -> [0] . '_' . $fqLength;
}
my $index = index($fq ->[1],$linker);
if( $index==-1){
WriteFastq($handle , $fq) if($fqLength > 0);
}elsif($index>-1){
my $fqs;#fqsubset
#$fqs ->[0] = $fq ->[0];
#$fqs ->[0] = $fqBase.'_'. ($fqInitLenght - )+ $index;
$fqs->[1]=substr($fq ->[1],0,$index + 2);
$fqs->[2]=substr($fq ->[2],0,$index + 2);
$fq->[1]=substr($fq ->[1],$index + 2);
$fq->[2]=substr($fq ->[2],$index + 2);
$fqs ->[0] = $fqBase.'_'. (($fqInitLength - $fqLength) + $index);
WriteFastq($handle , $fqs) if (length($fqs->[1])>0);
SplitFqByLinker($handle,$linker,$fq);
}
}
sub ReadFastq {
my $fqHandle = ${shift(@_)};
return if(eof($fqHandle));
my $seqHeader = <$fqHandle>;
chomp $seqHeader;
$seqHeader = substr($seqHeader,1);
my $seq = <$fqHandle>;
chomp $seq;
my $qualHeader = <$fqHandle>;
my $qual = <$fqHandle>;
chomp $qual;
my $fastq;
$fastq->[1]=$seq;
$fastq->[0]=$seqHeader;
$fastq->[2]=$qual;
#warn $fastq->[0];
return $fastq;
}
sub WriteFastq {
my $fqHandle = ${shift(@_)};
my $fq = shift @_;
print $fqHandle "\@".$fq->[0]."\n";
print $fqHandle $fq->[1]."\n";
print $fqHandle "\+\n";
print $fqHandle $fq->[2]."\n";
#warn "\@".$fq->[0]."\n";
#print $fqHandle $fq->[1]."\n";
#print $fqHandle "\+\n";
#print $fqHandle $fq->[2]."\n";
}
sub TestQual {
#work in progress
my $qual=shift @_;
my $fq1=shift @_;
my $fq2;
$fq2=shift @_ if(scalar(@_));
die '[FATAL] Work in progress. Raise this as an issue, fix it yourself as commit or use alternative software like trimGalore/cutadapt.' if ($qual);
}
#!/usr/bin/perl
use warnings;
use strict;
use Data::Dumper;
use Getopt::Std;
use Scalar::Util qw(looks_like_number);
&main();
sub main {
#use List::Util qw/max/; my $in;
my $use="$0 [-q 0 ] -a LINKER reads_1.in.fq.gz reads.out.fq.gz [reads_2.in.fq.gz ] .
Splits reads by LINKER Sequence. Due to PE difficulties writes to single file. PE is experimental.\n";
#open/check some required files
my $opts;
getopts('q:a:', \%{$opts});
die $use."\n" if(not(scalar(@ARGV) == 2 || (scalar(@ARGV) == 3)));
die "[FATAL] if '-q INT' is specified use integer for INT".$use if(defined($opts -> {'q'}) && not(looks_like_number($opts -> {'q'})));
die "[FATAL] linkersequence does not match [ATCGNatcgn]" if(not($opts -> {'a'} =~ m/^[ATCGNatcgn]+$/));
#se
#open handles fastq files
open(my $fastq1Handle,"-|",'gzip -dc '.$ARGV[0])
or die "[FATAL] Cannot read open fq1 file ".$ARGV[0];
defined($ARGV[1]) or die "Outfile reads_1.out.fq.gz not specified!";
open(my $fastq1OutHandle,"|-",'gzip -c > '.$ARGV[1])
or die "[FATAL] Cannot write fq1 file ".$ARGV[1];
my $fastq2Handle;
my $fastq2OutHandle;
if(defined($ARGV[2]) && -e $ARGV[2] && defined($ARGV[3])){
open($fastq2Handle,"-|",'gzip -dc '.$ARGV[2])
or die "Cannot read open fq2 file ".$ARGV[2];
defined($ARGV[3]) or die "Outfile reads_2.out.fq.gz not specified!";
}
warn "## ". localtime(time()). " ## INFO ## Starting iteration.\n";
my $timelast = time();
my $stats;
#if reading record fails from both fq files then end
while( ( (not(defined($ARGV[2]))) || (my $fq2 = ReadFastq(\$fastq2Handle)) ) && (my $fq1 = ReadFastq(\$fastq1Handle)) ){
#progress
if($. % 1000 == 0){
my $time = time();
if($time % 20 == 0 && $time != $timelast){
$timelast = $time;
warn "## ". localtime($time). " ## INFO ## Currently at line ". $. ."\n";
}
}
$stats -> {'recordcount'}++;
if(not(defined($opts -> {'q'})) || TestQual($opts -> {'q'},$fq1)){
$stats -> {'passcount'}++;
SplitFqByLinker(\$fastq1OutHandle, $opts -> {'a'}, $fq1);
if(defined($ARGV[3]) && -e $ARGV[3]){
SplitFqByLinker(\$fastq1OutHandle, $opts -> {'a'}, $fq2);
}
}
}
if( ( (defined($ARGV[2]) && -e $ARGV[2] ) && (my $fq2 = ReadFastq(\$fastq2Handle)) ) || (my $fq1 = ReadFastq(\$fastq1Handle)) ){
unlink($ARGV[1])if(-e $ARGV[1]);
unlink($ARGV[3])if(-e $ARGV[3]);
die "Some fastq handles are still readable:This program produced invalid output. Check your fastq files!";
}
warn "## ". localtime(time()). " ## INFO ## Done with " . $stats -> {'recordcount'} . " records processed and " .$stats -> {'passcount'} . " records passed at line ". $. ."\n";
}
sub SplitFqByLinker {
my $handle = shift @_;
my $linker = shift @_;
my $fq = shift @_;
#my $fq2;
#$fq2 = shift @_ if(scalar(@_)>0);
#index print in header just to get good data
my $fqLength = length($fq -> [1]);
my $fqBase;
my $fqInitLength;
if ($fq -> [0] =~ m/(.*)_(\d+)$/){
$fqBase=$1;
$fqInitLength=$2;
}else{
$fqInitLength= $fqLength;
$fqBase = $fq -> [0];
$fq -> [0] = $fq -> [0] . '_' . $fqLength;
}
my $index = index($fq ->[1],$linker);
if( $index==-1){
WriteFastq($handle , $fq) if($fqLength > 0);
}elsif($index>-1){
my $fqs;#fqsubset
#$fqs ->[0] = $fq ->[0];
#$fqs ->[0] = $fqBase.'_'. ($fqInitLenght - )+ $index;
$fqs->[1]=substr($fq ->[1],0,$index);
$fqs->[2]=substr($fq ->[2],0,$index);
$fq->[1]=substr($fq ->[1],$index + length($linker));
$fq->[2]=substr($fq ->[2],$index + length($linker));
$fqs ->[0] = $fqBase.'_'. (($fqInitLength - $fqLength) + $index);
WriteFastq($handle , $fqs) if (length($fqs->[1])>0);
SplitFqByLinker($handle,$linker,$fq);
}
}
sub ReadFastq {
my $fqHandle = ${shift(@_)};
return if(eof($fqHandle));
my $seqHeader = <$fqHandle>;
chomp $seqHeader;
$seqHeader = substr($seqHeader,1);
my $seq = <$fqHandle>;
chomp $seq;
my $qualHeader = <$fqHandle>;
my $qual = <$fqHandle>;
chomp $qual;
my $fastq;
$fastq->[1]=$seq;
$fastq->[0]=$seqHeader;
$fastq->[2]=$qual;
#warn $fastq->[0];
return $fastq;
}
sub WriteFastq {
my $fqHandle = ${shift(@_)};
my $fq = shift @_;
print $fqHandle "\@".$fq->[0]."\n";
print $fqHandle $fq->[1]."\n";
print $fqHandle "\+\n";
print $fqHandle $fq->[2]."\n";
#warn "\@".$fq->[0]."\n";
#print $fqHandle $fq->[1]."\n";
#print $fqHandle "\+\n";
#print $fqHandle $fq->[2]."\n";
}
sub TestQual {
#work in progress
my $qual=shift @_;
my $fq1=shift @_;
my $fq2;
$fq2=shift @_ if(scalar(@_));
die '[FATAL] Work in progress. Raise this as an issue, fix it yourself as commit or use alternative software like trimGalore/cutadapt.' if ($qual);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment