This should show if there is an added yield in trimming nugene data by bedfile containing the landing probes. This applies tree different workflows for comparision as listed below.
- Trimbybed
- bbduk trim linker(hisat has diffculties with unaligned ends)
- hisat align
- trimbybed
- samtools rmdup
- picard metrics?
- Trimming1 simple
- bbduk trim linker(hisat has diffculties with unaligned ends)
- bbduk trim landing probe
- hisat align
- samtools sort/index
- samtools rmdup
- picard metrics?
- Trimming2
- bbduk trim linker+ landing probe
- hisat align
- samtools sort/index/rmdup
- samtools rmdup
- picard metrics?
till rmdup automated
(mkdir rmdup ;for i in trimByBed/*.bam ; do samtools sort -n $i | samtools fixmate - /dev/stdout| samtools sort - | samtools rmdup - /dev/stdout| tee rmdup/$( basename ${i}) | samtools index - rmdup/$(basename ${i} .bam ).bai; done) &> rmdup.log&
perl -wpe 'if($.%2==0){chomp;$_ = reverse($_); tr/ACTG/TGAC/; $_.="\n";}' /data/umcg-mterpstra/apps/data/resources/target_ET1508R_3a_782_3b_783_probeSeq.fasta > target_ET1508R_3a_782_3b_783_probeSeq.rc.fasta
( for i in ../*_1.fq.gz;do
echo $i
bash $EBROOTBBMAP/bbduk.sh -Xmx11g in=$i out=bbdukprobetrim_p1/$(basename $i) in2=$(dirname $i)/$(basename $i _1.fq.gz)_2.fq.gz out2=bbdukprobetrim_p1/$(basename $i _1.fq.gz)_2.fq.gz ref=/data/umcg-mterpstra/apps/data/resources/target_ET1508R_3a_782_3b_783_probeSeq.fasta hdist=1 ktrim=r rcomp=f k=31 mink=11 minlen=20 skipr2=t overwrite=true
bash $EBROOTBBMAP/bbduk.sh -Xmx11g in=bbdukprobetrim_p1/$(basename $i) out=bbdukprobetrim_p2/$(basename $i) in2=bbdukprobetrim_p1/$(basename $i _1.fq.gz)_2.fq.gz out2=bbdukprobetrim_p2/$(basename $i _1.fq.gz)_2.fq.gz ref=target_ET1508R_3a_782_3b_783_probeSeq.rc.fasta hdist=1 ktrim=l rcomp=f k=31 mink=11 minlen=20 skipr1=t overwrite=true
done ) &>bbdukprobetrim.log &
ml hisat2/2.0.4-foss-2016a-Python-2.7.11
(mkdir -p hisat2 ;for i in bbdukprobetrim_p2/*_1.fq.gz ; do hisat2 -x /data/umcg-mterpstra/apps/data//ftp.broadinstitute.org/bundle/2.8/b37//hisat2/2.0.3-beta-goolf-1.7.20/human_g1k_v37_decoy --known-splicesite-infile /data/umcg-mterpstra/apps/data//ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/hisat2-2.0.3-beta-foss-2016a/Homo_sapiens.GRCh37.75.splicesites.txt --score-min L,0,-0.6 --sp 1,1.5 -D 20 -R 3 -S hisat2/$(basename $i _1.fq.gz).sam --threads 1 -1 $i -2 $(dirname $i)/$(basename $i _1.fq.gz )_2.fq.gz; done) &> hisat2.log&
(for i in hisat2/*.sam; do echo $i;samtools sort -n $i | samtools fixmate - /dev/stdout | samtools sort - | samtools rmdup - /dev/stdout | samtools view -Sb -| tee rmdup/$(basename $i .sam).bam | samtools index - rmdup/$(basename $i .sam).bai; done) &> rmdup.log
paste <(grep 'in library' ../workflow_trimmming1/rmdup.log) <( grep 'in library' ../workflow_trimbybed/rmdup.log )
[bam_rmdup_core] 448313 / 558710 = 0.8024 in library ' ' [bam_rmdup_core] 455103 / 560426 = 0.8121 in library ' '
[bam_rmdup_core] 1719778 / 1915967 = 0.8976 in library ' ' [bam_rmdup_core] 1661811 / 1797921 = 0.9243 in library ' '
[bam_rmdup_core] 612195 / 717522 = 0.8532 in library ' ' [bam_rmdup_core] 727180 / 800850 = 0.9080 in library ' '
[bam_rmdup_core] 1835573 / 2049983 = 0.8954 in library ' ' [bam_rmdup_core] 1570763 / 1718325 = 0.9141 in library ' '
[bam_rmdup_core] 12976 / 19361 = 0.6702 in library ' ' [bam_rmdup_core] 11421 / 15557 = 0.7341 in library ' '
[bam_rmdup_core] 744007 / 902988 = 0.8239 in library ' ' [bam_rmdup_core] 691964 / 804402 = 0.8602 in library ' '
[bam_rmdup_core] 1698446 / 1837886 = 0.9241 in library ' ' [bam_rmdup_core] 1915840 / 2014275 = 0.9511 in library ' '
[bam_rmdup_core] 1333898 / 1477292 = 0.9029 in library ' ' [bam_rmdup_core] 1612840 / 1714238 = 0.9408 in library ' '
[bam_rmdup_core] 5548 / 7088 = 0.7827 in library ' ' [bam_rmdup_core] 5504 / 7004 = 0.7858 in library ' '
table
trimming | trimbybed | ||||
---|---|---|---|---|---|
dups | total | pctdups | dups | total | pctdups |
448313 | 558710 | 0.8024 | 455103 | 560426 | 0.8121 |
1719778 | 1915967 | 0.8976 | 1661811 | 1797921 | 0.9243 |
612195 | 717522 | 0.8532 | 727180 | 800850 | 0.9080 |
1835573 | 2049983 | 0.8954 | 1570763 | 1718325 | 0.9141 |
12976 | 19361 | 0.6702 | 11421 | 15557 | 0.7341 |
744007 | 902988 | 0.8239 | 691964 | 804402 | 0.8602 |
1698446 | 1837886 | 0.9241 | 1915840 | 2014275 | 0.9511 |
1333898 | 1477292 | 0.9029 | 1612840 | 1714238 | 0.9408 |
5548 | 7088 | 0.7827 | 5504 | 7004 | 0.7858 |
look at subcaptions.
#forward
bash $EBROOTBBMAP/bbduk.sh -Xmx11g in=/scratch/umcg-mterpstra/raw/160920_M01785_0358_000000000-AW3LK_ANNAPool3/T16-3666_S1_L001_R1_001.fastq.gz out=bbmap.R1.fq.gz in2=/scratch/umcg-mterpstra/raw/160920_M01785_0358_000000000-AW3LK_ANNAPool3/T16-3666_S1_L001_R3_001.fastq.gz out2=bbmap.R3.fq.gz ref=/data/umcg-mterpstra/apps/data/resources/target_ET1508R_3a_782_3b_783_probeSeq.fasta hdist=1 ktrim=r rcomp=f k=31 mink=11 minlen=20 skipr2=t
#reverse
perl -wpe 'if($.%2==0){chomp;$_ = reverse($_); tr/ACTG/TGAC/; $_.="\n";}' /data/umcg-mterpstra/apps/data/resources/target_ET1508R_3a_782_3b_783_probeSeq.fasta > target_ET1508R_3a_782_3b_783_probeSeq.rc.fasta
bash $EBROOTBBMAP/bbduk.sh -Xmx11g in=bbmap.R1.fq.gz out=bbmap2.R1.fq.gz in2=bbmap.R3.fq.gz out2=bbmap2.R3.fq.gz ref=target_ET1508R_3a_782_3b_783_probeSeq.rc.fasta hdist=1 ktrim=l rcomp=f k=31 mink=11 minlen=20 skipr1=t
hisat2 -x /data/umcg-mterpstra/apps/data//ftp.broadinstitute.org/bundle/2.8/b37//hisat2/2.0.3-beta-goolf-1.7.20/human_g1k_v37_decoy --known-splicesite-infile /data/umcg-mterpstra/apps/data//ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/hisat2-2.0.3-beta-foss-2016a/Homo_sapiens.GRCh37.75.splicesites.txt --score-min L,0,-0.6 --sp 1,1.5 -D 20 -R 3 -S test/S1_bbduk.sam --threads 1 -1 bbmap2.R1.fq.gz -2 bbmap2.R3.fq.gz
1288697 (100.00%) were paired; of these:
124677 (9.67%) aligned concordantly 0 times
572726 (44.44%) aligned concordantly exactly 1 time
591294 (45.88%) aligned concordantly >1 times
----
124677 pairs aligned concordantly 0 times; of these:
29550 (23.70%) aligned discordantly 1 time
----
95127 pairs aligned 0 times concordantly or discordantly; of these:
190254 mates make up the pairs; of these:
81134 (42.65%) aligned 0 times
34275 (18.02%) aligned exactly 1 time
74845 (39.34%) aligned >1 times
96.85% overall alignment rate
samtools sort -n TEST/out_bbduk.sam.bam | samtools fixmate - /dev/stdout | samtools sort - | samtools rmdup - TEST/out_bbduk.md.bam
...
[bam_rmdup_core] inconsistent BAM file for pair 'M01785:358:000000000-AW3LK:1:1106:20192:13353'. Continue anyway.
[bam_rmdup_core] inconsistent BAM file for pair 'M01785:358:000000000-AW3LK:1:1106:20192:13353'. Continue anyway.
[bam_rmdup_core] inconsistent BAM file for pair 'M01785:358:000000000-AW3LK:1:2109:20205:8855'. Continue anyway.
[bam_rmdup_core] inconsistent BAM file for pair 'M01785:358:000000000-AW3LK:1:2111:10840:4623'. Continue anyway.
[bam_rmdup_core] inconsistent BAM file for pair 'M01785:358:000000000-AW3LK:1:2111:10840:4623'. Continue anyway.
[bam_rmdup_core] inconsistent BAM file for pair 'M01785:358:000000000-AW3LK:1:2111:12675:14976'. Continue anyway.
[bam_rmdup_core] inconsistent BAM file for pair 'M01785:358:000000000-AW3LK:1:2106:11490:9048'. Continue anyway.
[bam_rmdup_core] inconsistent BAM file for pair 'M01785:358:000000000-AW3LK:1:2113:6030:13806'. Continue anyway.
[bam_rmdup_core] 23 unmatched pairs
[bam_rmdup_core] 1712259 / 1784607 = 0.9595 in library ' '
hisat2 -x /data/umcg-mterpstra/apps/data//ftp.broadinstitute.org/bundle/2.8/b37//hisat2/2.0.3-beta-goolf-1.7.20/human_g1k_v37_decoy --known-splicesite-infile /data/umcg-mterpstra/apps/data//ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/hisat2-2.0.3-beta-foss-2016a/Homo_sapiens.GRCh37.75.splicesites.txt --score-min L,0,-0.6 --sp 1,1.5 -D 20 -R 3 -S test/S1.sam --threads 1 -1 /scratch/umcg-mterpstra/raw/160920_M01785_0358_000000000-AW3LK_ANNAPool3/T16-3666_S1_L001_R1_001.fastq.gz -2 /scratch/umcg-mterpstra/raw/160920_M01785_0358_000000000-AW3LK_ANNAPool3/T16-3666_S1_L001_R3_001.fastq.gz
1490590 reads; of these:
1490590 (100.00%) were paired; of these:
138342 (9.28%) aligned concordantly 0 times
599122 (40.19%) aligned concordantly exactly 1 time
753126 (50.53%) aligned concordantly >1 times
----
138342 pairs aligned concordantly 0 times; of these:
1803 (1.30%) aligned discordantly 1 time
----TEST/out.sam
136539 pairs aligned 0 times concordantly or discordantly; of these:
273078 mates make up the pairs; of these:
165230 (60.51%) aligned 0 times
22876 (8.38%) aligned exactly 1 time
84972 (31.12%) aligned >1 times
94.46% overall alignment rate
wait && perl bin/trimByBed.pl -s test/S1.sam -b /data/umcg-mterpstra/apps/data/resources/probeSeq_RDonly_ET1262F_2_182.bed -o TEST/ -n TEST/out.sam &>/dev/stdout | tail -n 1000
samtools sort -n TEST/out.sam.bam | samtools fixmate - /dev/stdout | samtools sort - | samtools rmdup - TEST/out.md.bam
[bam_sort_core] merging from 2 files...
[bam_sort_core] merging from 2 files...
[bam_rmdup_core] processing reference 1...
[bam_rmdup_core] 5 unmatched pairs
[bam_rmdup_core] processing reference 2...
[bam_rmdup_core] 19 unmatched pairs
[bam_rmdup_core] processing reference 3...
[bam_rmdup_core] 3 unmatched pairs
[bam_rmdup_core] processing reference 4...
[bam_rmdup_core] 33 unmatched pairs
[bam_rmdup_core] processing reference 5...
[bam_rmdup_core] 138 unmatched pairs
[bam_rmdup_core] processing reference 6...
[bam_rmdup_core] 63 unmatched pairs
[bam_rmdup_core] processing reference 7...
[bam_rmdup_core] 5 unmatched pairs
[bam_rmdup_core] processing reference 8...
[bam_rmdup_core] 7 unmatched pairs
[bam_rmdup_core] processing reference 9...
[bam_rmdup_core] 3 unmatched pairs
[bam_rmdup_core] processing reference 10...
[bam_rmdup_core] 118 unmatched pairs
[bam_rmdup_core] processing reference 11...
[bam_rmdup_core] processing reference 12...
[bam_rmdup_core] 7 unmatched pairs
[bam_rmdup_core] processing reference 13...
[bam_rmdup_core] processing reference 14...
[bam_rmdup_core] 13 unmatched pairs
[bam_rmdup_core] processing reference 15...
[bam_rmdup_core] 3 unmatched pairs
[bam_rmdup_core] processing reference 16...
[bam_rmdup_core] 1 unmatched pairs
[bam_rmdup_core] processing reference 17...
[bam_rmdup_core] processing reference 18...
[bam_rmdup_core] processing reference 19...
[bam_rmdup_core] processing reference 20...
[bam_rmdup_core] 33 unmatched pairs
[bam_rmdup_core] processing reference 21...
[bam_rmdup_core] 1 unmatched pairs
[bam_rmdup_core] processing reference 22...
[bam_rmdup_core] 1 unmatched pairs
[bam_rmdup_core] processing reference X...
[bam_rmdup_core] 4 unmatched pairs
[bam_rmdup_core] processing reference Y...
[bam_rmdup_core] processing reference MT...
[bam_rmdup_core] 1 unmatched pairs
[bam_rmdup_core] processing reference GL000229.1...
[bam_rmdup_core] processing reference GL000231.1...
[bam_rmdup_core] processing reference GL000220.1...
[bam_rmdup_core] 13 unmatched pairs
[bam_rmdup_core] processing reference GL000219.1...
[bam_rmdup_core] processing reference GL000194.1...
[bam_rmdup_core] processing reference GL000192.1...
[bam_rmdup_core] processing reference hs37d5...
[bam_rmdup_core] 9 unmatched pairs
[bam_rmdup_core] 1311949 / 1362602 = 0.9628 in library ' '