Skip to content

Instantly share code, notes, and snippets.

@mmterpstra
Last active January 18, 2017 14:05
Show Gist options
  • Save mmterpstra/c40a952ef10512de8815584b867bfebb to your computer and use it in GitHub Desktop.
Save mmterpstra/c40a952ef10512de8815584b867bfebb to your computer and use it in GitHub Desktop.
Downsample Fastq data
set -e
set -x
set -o pipefail
TMP00=./results/tmp/
RAWDIR=/path/to/fastq/files
ml picard/2.2.2-foss-2016a-Java-1.8.0_74
(for fastq1 in ${RAWDIR}/*.fastq.gz; do
echo "## INFO ##"$(date)" ## file '"$fastq1"' start"
fastq2=$(dirname $fastq1)/$(basename $fastq1 _R1_001.fastq.gz)"_R3_001.fastq.gz"
rbcfastq=$(dirname $fastq1)/$(basename $fastq1 _R1_001.fastq.gz)"_R2_001.fastq.gz"
mkdir -p $TMP00/projects/downsample/paddedfq/
tmpFastq1=$TMP00"/projects/downsample/paddedfq/"$(basename $fastq1)
tmpFastq2=$TMP00"/projects/downsample/paddedfq/"$(basename $fastq2)
#padding + random barcode in flowcell name
ml DigitalBarcodeReadgroups/0.1.5-foss-2016a-Perl-5.20.2-bare
gzip -dc $fastq1 | \
perl -wpe 'if($.%4==2 && length($_) < 10){chomp $_;$_ .= "N" x (10 - length($_))."\n"}elsif($.%4==0 && length($_) < 10){chomp $_;$_ .= "#" x (10 - length($_))."\n"}' | \
gzip | perl $EBROOTDIGITALBARCODEREADGROUPS/src/NugeneMergeFastqFiles.pl $rbcfastq ./ -
mv -v -- "-.fq.gz" $tmpFastq1
gzip -cd $fastq2 | \
perl -wpe 'if($.%4==2 && length($_) < 10){chomp $_;$_ .= "N" x (10 - length($_))."\n"}elsif($.%4==0 && length($_) < 10){chomp $_;$_ .= "#" x (10 - length($_))."\n"}' | \
gzip | perl $EBROOTDIGITALBARCODEREADGROUPS/src/NugeneMergeFastqFiles.pl $rbcfastq ./ -
mv -v -- "-.fq.gz" $tmpFastq2
#fastq to sam
mkdir -p $TMP00/projects/downsample/ubam/
tmpBam=$TMP00/projects/downsample/ubam/$(basename $fastq1 _R1_001.fastq.gz).bam
java -jar ${EBROOTPICARD}/picard.jar FastqToSam F1=$tmpFastq1 F2=$tmpFastq2 O=$tmpBam SAMPLE_NAME="fastq" SORT_ORDER=queryname;
#downsample 4 times and different amount of reads
( for run in "1" "2" "3" "4"; do
for n in "1500000" "1000000" "200000" "400000" "100000"; do
(
mkdir -p $TMP00/projects/downsample/r${run}n${n}/
tmpDownsampledBam=$TMP00/projects/downsample/r${run}n${n}/$(basename $fastq1 _R1_001.fastq.gz).bam
fqLines=$(gzip -dc $fastq1| wc -l)
#downsamplesam
java -jar ${EBROOTPICARD}/picard.jar DownsampleSam I=$tmpBam O=$tmpDownsampledBam PROBABILITY=$(perl -we 'my $fqlines=shift @ARGV; my $fqReads=$fqlines/4;my $targetReads=shift @ARGV;print $targetReads/$fqReads;' $fqLines $n)
#samtofastq
downsampledFastq1gz=$TMP00/projects/downsample/r${run}n${n}/$(basename $fastq1)
downsampledFastq2gz=$TMP00/projects/downsample/r${run}n${n}/$(basename $fastq2)
java -jar ${EBROOTPICARD}/picard.jar SamToFastq I=$tmpDownsampledBam FASTQ=$downsampledFastq1gz SECOND_END_FASTQ=$downsampledFastq2gz
gzip -dc $downsampledFastq1gz | perl -wne 'if($. % 4 == 1){print $_; my $seq = $_; $seq =~ s/.*_([ATCGN]{6}).*/$1/; print $seq."+\n"."B" x 6 . "\n"}' | gzip > $TMP00/projects/downsample/r${run}n${n}/$(basename $rbcfastq)
)&
done
wait
done)
echo "## INFO ##"$(date)" ## file '"$fastq1"' done"
done)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment