Created
March 7, 2018 23:10
-
-
Save clintval/4b9f5af8b536312e042a0e393b67a6a2 to your computer and use it in GitHub Desktop.
Academic DCS pipeline v2.0 from DOI:10.1073/pnas.1700759114
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin.sh | |
| # request Bourne shell as shell for job | |
| #$ -S /bin/sh | |
| #$ -cwd | |
| #$ -V | |
| #$ -m e | |
| #$ -pe whole_nodes 7 | |
| #$ -M [email protected] | |
| ############################### | |
| module add bwa/0.6.1 | |
| module add samtools/0.1.19 | |
| module add python/2.7.2 | |
| module add numpy/1.6.1 | |
| module add biopython/1.64 | |
| module add pysam/0.7.4 | |
| module add jre/1.7.0-25 | |
| # DS bash script | |
| # Version 2.1 | |
| #Library Prep (nt): | |
| # barcode: 8 | |
| # spacer: 5 | |
| # rlength: 150 | |
| # *rlength: 150 - 8 - 5 = 137 | |
| #Post-Processing: | |
| # Left-side Clip: 4 | |
| # Right-side Clip: 21 | |
| # rlength: 137 - 4 - 21 = 112 | |
| # Step 1: Setup variables for run: | |
| clear | |
| # Set up error checking | |
| # Stop on any error | |
| set -e | |
| # Stop on an error inside a pipeline | |
| # set -o pipefail # not permissable on ROUS server | |
| # Throw an error on calling an unassigned variable | |
| set -u | |
| #DEFAULTS | |
| DSpath='' | |
| alignRef='' | |
| runIdentifier='' | |
| read1in=seq1.fq | |
| read2in=seq2.fq | |
| iSize=-1 | |
| minMem=3 | |
| maxMem=1000 | |
| cutOff=0.7 | |
| nCutOff=0.3 | |
| readLength=150 | |
| barcodeLength=8 | |
| spacerLength=5 | |
| filtersSet='os' | |
| readTypes='dpm' | |
| repFilt=9 | |
| readOut=1000000 | |
| #POST_PROCESSING_PATHS | |
| picard=~/picard-tools-1.119 | |
| GATK=~/GATK | |
| #NONDEFAULTS | |
| #FINAL_READ_LENGTH 150-8-5 = 137 | |
| readLength=$((readLength-barcodeLength-spacerLength)) | |
| #LOG_FILE_NAME | |
| logFile=${runIdentifier}.log.txt | |
| #Export all variables | |
| export DSpath | |
| export alignRef | |
| export runIdentifier | |
| export read1in | |
| export read2in | |
| export iSize | |
| export minMem | |
| export maxMem | |
| export cutOff | |
| export nCutOff | |
| export readLength | |
| export barcodeLength | |
| export spacerLength | |
| export filtersSet | |
| export readTypes | |
| export repFilt | |
| export readOut | |
| export picard | |
| export GATK | |
| # Print out options used to log file | |
| touch $logFile | |
| echo "Run identifier: " $runIdentifier | tee -a ${logFile} | |
| echo "Program path: " $DSpath | tee -a ${logFile} | |
| echo "Reference genome: " $alignRef | tee -a ${logFile} | |
| echo "Barcode length: " $barcodeLength | tee -a ${logFile} | |
| echo "Spacer length: " $spacerLength | tee -a ${logFile} | |
| echo "Post-tag_to_header read length: " $readLength | tee -a ${logFile} | |
| echo "Repetitive tag filter length: " $repFilt | tee -a ${logFile} | |
| echo "Minimum family size: " $minMem | tee -a ${logFile} | |
| echo "Maximum family size: " $maxMem | tee -a ${logFile} | |
| echo "Consensus cutoff: " $cutOff | tee -a ${logFile} | |
| echo "Consensus N cutoff: " $nCutOff | tee -a ${logFile} | |
| echo "Read types: " $readTypes | tee -a ${logFile} | |
| echo "Filters: " $filtersSet | tee -a ${logFile} | |
| echo "" | tee -a ${logFile} | |
| # Step 2: Run tag_to_header.py on input files | |
| echo "Starting Run" | tee -a ${logFile} | |
| echo "tag_to_header starting" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| echo "" | tee -a ${logFile} | |
| python ${DSpath}/tag_to_header.py \ | |
| --infile1 $read1in \ | |
| --infile2 $read2in \ | |
| --outfile1 ${runIdentifier}.seq1.fq.smi \ | |
| --outfile2 ${runIdentifier}.seq2.fq.smi \ | |
| --barcode_length $barcodeLength \ | |
| --spacer_length $spacerLength \ | |
| --read_out $readOut | |
| # Step 3: Align sequences | |
| echo "Aligning with BWA" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| bwa aln $alignRef ${runIdentifier}.seq1.fq.smi > ${runIdentifier}.seq1.aln | |
| bwa aln $alignRef ${runIdentifier}.seq2.fq.smi > ${runIdentifier}.seq2.aln | |
| bwa sampe \ | |
| -s $alignRef \ | |
| ${runIdentifier}.seq1.aln \ | |
| ${runIdentifier}.seq2.aln \ | |
| ${runIdentifier}.seq1.fq.smi \ | |
| ${runIdentifier}.seq2.fq.smi \ | |
| > ${runIdentifier}.pe.sam | |
| # Step 4: Sort aligned sequences | |
| echo "Sorting aligned sequences" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| samtools view -Sbu ${runIdentifier}.pe.sam \ | |
| | samtools sort - ${runIdentifier}.pe.sort | |
| # Step 5: Run Consensus Maker | |
| echo "Starting Consensus Maker" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| python ${DSpath}/ConsensusMaker.py \ | |
| --infile ${runIdentifier}.pe.sort.bam \ | |
| --tagfile ${runIdentifier}.pe.tagcounts \ | |
| --outfile ${runIdentifier}.sscs.bam \ | |
| --minmem $minMem \ | |
| --maxmem $maxMem \ | |
| --readlength $readLength \ | |
| --cutoff $cutOff \ | |
| --Ncutoff $nCutOff \ | |
| --read_type $readTypes \ | |
| --filt $filtersSet \ | |
| --isize $iSize | |
| # Step 6: Sort SSCSs | |
| echo "Sorting SSCSs" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| samtools view -bu ${runIdentifier}.sscs.bam \ | |
| | samtools sort - ${runIdentifier}.sscs.sort.bam | |
| # Step 7: Index sorted SSCSs | |
| echo "Indexing sorted SSCSs" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| samtools index ${runIdentifier}.sscs.sort.bam | |
| # Step 8: Run Duplex Maker | |
| echo "Starting Duplex Maker" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| python ${DSpath}/DuplexMaker.py \ | |
| --infile ${runIdentifier}.sscs.sort.bam \ | |
| --outfile ${runIdentifier}.dcs.bam \ | |
| --Ncutoff $nCutOff \ | |
| --readlength $readLength \ | |
| --barcode_length $barcodeLength | |
| # Step 9: Align DCSs | |
| echo "Aligning DCSs" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| bwa aln $alignRef ${runIdentifier}.dcs.r1.fq > ${runIdentifier}.dcs.r1.aln | |
| bwa aln $alignRef ${runIdentifier}.dcs.r2.fq > ${runIdentifier}.dcs.r2.aln | |
| bwa sampe \ | |
| -s $alignRef ${runIdentifier}.dcs.r1.aln \ | |
| ${runIdentifier}.dcs.r2.aln ${runIdentifier}.dcs.r1.fq \ | |
| ${runIdentifier}.dcs.r2.fq \ | |
| > ${runIdentifier}.dcs.sam | |
| # Step 10: Sort aligned DCSs | |
| echo "Sorting aligned DCSs" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| samtools view -Sbu ${runIdentifier}.dcs.sam \ | |
| | samtools sort - ${runIdentifier}.dcs.aln.sort | |
| # Step 11: Index sorted DCSs | |
| echo "Indexing sorted DCSs" | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| samtools index ${runIdentifier}.dcs.aln.sort.bam | |
| echo "Finished with run " $runIdentifier \ | |
| | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| echo "Attempting post-processing for: " $runIdentifier \ | |
| | tee -a ${logFile} | |
| date | tee -a ${logFile} | |
| ############################################################################### | |
| # POST_PROCESSING | |
| ############################################################################### | |
| # Setup | |
| processFile=${runIdentifier}.dcs.aln.sort.bam | |
| minDepth=100 | |
| minClonality=0 | |
| maxClonality=0.01 | |
| export processFile | |
| export minDepth | |
| export minClonality | |
| export maxClonality | |
| echo "File for processing: " $processFile | tee -a ${logFile} | |
| echo "Minimum Depth: " $minDepth | tee -a ${logFile} | |
| echo "Minimum Clonality: " $minClonality | tee -a ${logFile} | |
| echo "Maximum Clonality: " $maxClonality | tee -a ${logFile} | |
| # Filter for mapping reads | |
| echo "Filtering" | tee -a ${logFile} | |
| samtools view \ | |
| -F 4 \ | |
| -b ${runIdentifier}.dcs.aln.sort.bam \ | |
| > ${runIdentifier}.dcs.filt.bam | |
| # Clipping final file | |
| echo "Clipping Final File" | tee -a ${logFile} | |
| java -jar $picard/AddOrReplaceReadGroups.jar \ | |
| INPUT=${runIdentifier}.dcs.filt.bam \ | |
| OUTPUT=${runIdentifier}.dcs.filt.readgroups.bam \ | |
| RGLB=UW \ | |
| RGPL=Illumina \ | |
| RGPU=ATATAT \ | |
| RGSM=default | |
| samtools index ${runIdentifier}.dcs.filt.readgroups.bam | |
| java -Xmx2g -jar $gaTK/GenomeAnalysisTK.jar \ | |
| -T RealignerTargetCreator \ | |
| -R $alignRef \ | |
| -I ${runIdentifier}.dcs.filt.readgroups.bam \ | |
| -o ${runIdentifier}.dcs.filt.readgroups.intervals | |
| java -Xmx2g -jar $gaTK/GenomeAnalysisTK.jar \ | |
| -T IndelRealigner \ | |
| -R $alignRef \ | |
| -I ${runIdentifier}.dcs.filt.readgroups.bam \ | |
| -targetIntervals ${runIdentifier}.dcs.filt.readgroups.intervals \ | |
| -o ${runIdentifier}.dcs.filt.readgroups.realign.bam | |
| java -Xmx2g -jar $gaTK/GenomeAnalysisTK.jar \ | |
| -T ClipReads \ | |
| -I ${runIdentifier}.dcs.filt.readgroups.realign.bam \ | |
| -o ${runIdentifier}.dcs.filt.readgroups.clipped.bam \ | |
| -R $alignRef \ | |
| --cyclesToTrim "1-4,117-137" \ | |
| --clipRepresentation SOFTCLIP_BASES | |
| # 137-4-21 = 112 | |
| # Generating stats from final file | |
| # -B (--no-BAQ) Disable per-Base Alignment Quality... Qualities exist as ASCII J | |
| # -A (--count-orphans) Do not discard anmalous read pairs | |
| # -d 500000 (--max-depth INT) | |
| samtools mpileup \ | |
| -B \ | |
| -A \ | |
| -d 500000 \ | |
| -f $alignRef \ | |
| ${runIdentifier}.dcs.filt.readgroups.clipped.bam \ | |
| | tee ${runIdentifier}.dcs.pileup \ | |
| | python ${DSpath}/CountMuts.py \ | |
| -d $minDepth \ | |
| -c $minClonality \ | |
| -C $maxClonality \ | |
| | tee ${runIdentifier}.dcs.aln.sort.bam.d${minDepth}-c${minClonality}-${maxClonality}.unique.countmuts | |
| cat ${runIdentifier}.dcs.final.bam.pileup \ | |
| | python ${DSpath}/CountMuts.py \ | |
| -d $minDepth \ | |
| -c $minClonality \ | |
| -C $maxClonality \ | |
| > ${runIdentifier}.dcs.unique.countmuts | |
| python ${DSpath}/mut-position.py \ | |
| -i ${runIdentifier}.dcs.final.bam.pileup \ | |
| -o ${runIdentifier}.dcs.aln.sort.bam.d${minDepth}-c${minClonality}-${maxClonality}.mutpos \ | |
| -d $minDepth \ | |
| -c $minClonality \ | |
| -C $maxClonality | |
| echo "Finished post-processing for: " $runIdentifier \ | |
| | tee -a ${logFile} | |
| date | tee -a ${logFile} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment