Skip to content

Instantly share code, notes, and snippets.

@clintval
Created March 7, 2018 23:10
Show Gist options
  • Select an option

  • Save clintval/4b9f5af8b536312e042a0e393b67a6a2 to your computer and use it in GitHub Desktop.

Select an option

Save clintval/4b9f5af8b536312e042a0e393b67a6a2 to your computer and use it in GitHub Desktop.
Academic DCS pipeline v2.0 from DOI:10.1073/pnas.1700759114
#!/bin.sh
# request Bourne shell as shell for job
#$ -S /bin/sh
#$ -cwd
#$ -V
#$ -m e
#$ -pe whole_nodes 7
#$ -M [email protected]
###############################
module add bwa/0.6.1
module add samtools/0.1.19
module add python/2.7.2
module add numpy/1.6.1
module add biopython/1.64
module add pysam/0.7.4
module add jre/1.7.0-25
# DS bash script
# Version 2.1
#Library Prep (nt):
# barcode: 8
# spacer: 5
# rlength: 150
# *rlength: 150 - 8 - 5 = 137
#Post-Processing:
# Left-side Clip: 4
# Right-side Clip: 21
# rlength: 137 - 4 - 21 = 112
# Step 1: Setup variables for run:
clear
# Set up error checking
# Stop on any error
set -e
# Stop on an error inside a pipeline
# set -o pipefail # not permissable on ROUS server
# Throw an error on calling an unassigned variable
set -u
#DEFAULTS
DSpath=''
alignRef=''
runIdentifier=''
read1in=seq1.fq
read2in=seq2.fq
iSize=-1
minMem=3
maxMem=1000
cutOff=0.7
nCutOff=0.3
readLength=150
barcodeLength=8
spacerLength=5
filtersSet='os'
readTypes='dpm'
repFilt=9
readOut=1000000
#POST_PROCESSING_PATHS
picard=~/picard-tools-1.119
GATK=~/GATK
#NONDEFAULTS
#FINAL_READ_LENGTH 150-8-5 = 137
readLength=$((readLength-barcodeLength-spacerLength))
#LOG_FILE_NAME
logFile=${runIdentifier}.log.txt
#Export all variables
export DSpath
export alignRef
export runIdentifier
export read1in
export read2in
export iSize
export minMem
export maxMem
export cutOff
export nCutOff
export readLength
export barcodeLength
export spacerLength
export filtersSet
export readTypes
export repFilt
export readOut
export picard
export GATK
# Print out options used to log file
touch $logFile
echo "Run identifier: " $runIdentifier | tee -a ${logFile}
echo "Program path: " $DSpath | tee -a ${logFile}
echo "Reference genome: " $alignRef | tee -a ${logFile}
echo "Barcode length: " $barcodeLength | tee -a ${logFile}
echo "Spacer length: " $spacerLength | tee -a ${logFile}
echo "Post-tag_to_header read length: " $readLength | tee -a ${logFile}
echo "Repetitive tag filter length: " $repFilt | tee -a ${logFile}
echo "Minimum family size: " $minMem | tee -a ${logFile}
echo "Maximum family size: " $maxMem | tee -a ${logFile}
echo "Consensus cutoff: " $cutOff | tee -a ${logFile}
echo "Consensus N cutoff: " $nCutOff | tee -a ${logFile}
echo "Read types: " $readTypes | tee -a ${logFile}
echo "Filters: " $filtersSet | tee -a ${logFile}
echo "" | tee -a ${logFile}
# Step 2: Run tag_to_header.py on input files
echo "Starting Run" | tee -a ${logFile}
echo "tag_to_header starting" | tee -a ${logFile}
date | tee -a ${logFile}
echo "" | tee -a ${logFile}
python ${DSpath}/tag_to_header.py \
--infile1 $read1in \
--infile2 $read2in \
--outfile1 ${runIdentifier}.seq1.fq.smi \
--outfile2 ${runIdentifier}.seq2.fq.smi \
--barcode_length $barcodeLength \
--spacer_length $spacerLength \
--read_out $readOut
# Step 3: Align sequences
echo "Aligning with BWA" | tee -a ${logFile}
date | tee -a ${logFile}
bwa aln $alignRef ${runIdentifier}.seq1.fq.smi > ${runIdentifier}.seq1.aln
bwa aln $alignRef ${runIdentifier}.seq2.fq.smi > ${runIdentifier}.seq2.aln
bwa sampe \
-s $alignRef \
${runIdentifier}.seq1.aln \
${runIdentifier}.seq2.aln \
${runIdentifier}.seq1.fq.smi \
${runIdentifier}.seq2.fq.smi \
> ${runIdentifier}.pe.sam
# Step 4: Sort aligned sequences
echo "Sorting aligned sequences" | tee -a ${logFile}
date | tee -a ${logFile}
samtools view -Sbu ${runIdentifier}.pe.sam \
| samtools sort - ${runIdentifier}.pe.sort
# Step 5: Run Consensus Maker
echo "Starting Consensus Maker" | tee -a ${logFile}
date | tee -a ${logFile}
python ${DSpath}/ConsensusMaker.py \
--infile ${runIdentifier}.pe.sort.bam \
--tagfile ${runIdentifier}.pe.tagcounts \
--outfile ${runIdentifier}.sscs.bam \
--minmem $minMem \
--maxmem $maxMem \
--readlength $readLength \
--cutoff $cutOff \
--Ncutoff $nCutOff \
--read_type $readTypes \
--filt $filtersSet \
--isize $iSize
# Step 6: Sort SSCSs
echo "Sorting SSCSs" | tee -a ${logFile}
date | tee -a ${logFile}
samtools view -bu ${runIdentifier}.sscs.bam \
| samtools sort - ${runIdentifier}.sscs.sort.bam
# Step 7: Index sorted SSCSs
echo "Indexing sorted SSCSs" | tee -a ${logFile}
date | tee -a ${logFile}
samtools index ${runIdentifier}.sscs.sort.bam
# Step 8: Run Duplex Maker
echo "Starting Duplex Maker" | tee -a ${logFile}
date | tee -a ${logFile}
python ${DSpath}/DuplexMaker.py \
--infile ${runIdentifier}.sscs.sort.bam \
--outfile ${runIdentifier}.dcs.bam \
--Ncutoff $nCutOff \
--readlength $readLength \
--barcode_length $barcodeLength
# Step 9: Align DCSs
echo "Aligning DCSs" | tee -a ${logFile}
date | tee -a ${logFile}
bwa aln $alignRef ${runIdentifier}.dcs.r1.fq > ${runIdentifier}.dcs.r1.aln
bwa aln $alignRef ${runIdentifier}.dcs.r2.fq > ${runIdentifier}.dcs.r2.aln
bwa sampe \
-s $alignRef ${runIdentifier}.dcs.r1.aln \
${runIdentifier}.dcs.r2.aln ${runIdentifier}.dcs.r1.fq \
${runIdentifier}.dcs.r2.fq \
> ${runIdentifier}.dcs.sam
# Step 10: Sort aligned DCSs
echo "Sorting aligned DCSs" | tee -a ${logFile}
date | tee -a ${logFile}
samtools view -Sbu ${runIdentifier}.dcs.sam \
| samtools sort - ${runIdentifier}.dcs.aln.sort
# Step 11: Index sorted DCSs
echo "Indexing sorted DCSs" | tee -a ${logFile}
date | tee -a ${logFile}
samtools index ${runIdentifier}.dcs.aln.sort.bam
echo "Finished with run " $runIdentifier \
| tee -a ${logFile}
date | tee -a ${logFile}
echo "Attempting post-processing for: " $runIdentifier \
| tee -a ${logFile}
date | tee -a ${logFile}
###############################################################################
# POST_PROCESSING
###############################################################################
# Setup
processFile=${runIdentifier}.dcs.aln.sort.bam
minDepth=100
minClonality=0
maxClonality=0.01
export processFile
export minDepth
export minClonality
export maxClonality
echo "File for processing: " $processFile | tee -a ${logFile}
echo "Minimum Depth: " $minDepth | tee -a ${logFile}
echo "Minimum Clonality: " $minClonality | tee -a ${logFile}
echo "Maximum Clonality: " $maxClonality | tee -a ${logFile}
# Filter for mapping reads
echo "Filtering" | tee -a ${logFile}
samtools view \
-F 4 \
-b ${runIdentifier}.dcs.aln.sort.bam \
> ${runIdentifier}.dcs.filt.bam
# Clipping final file
echo "Clipping Final File" | tee -a ${logFile}
java -jar $picard/AddOrReplaceReadGroups.jar \
INPUT=${runIdentifier}.dcs.filt.bam \
OUTPUT=${runIdentifier}.dcs.filt.readgroups.bam \
RGLB=UW \
RGPL=Illumina \
RGPU=ATATAT \
RGSM=default
samtools index ${runIdentifier}.dcs.filt.readgroups.bam
java -Xmx2g -jar $gaTK/GenomeAnalysisTK.jar \
-T RealignerTargetCreator \
-R $alignRef \
-I ${runIdentifier}.dcs.filt.readgroups.bam \
-o ${runIdentifier}.dcs.filt.readgroups.intervals
java -Xmx2g -jar $gaTK/GenomeAnalysisTK.jar \
-T IndelRealigner \
-R $alignRef \
-I ${runIdentifier}.dcs.filt.readgroups.bam \
-targetIntervals ${runIdentifier}.dcs.filt.readgroups.intervals \
-o ${runIdentifier}.dcs.filt.readgroups.realign.bam
java -Xmx2g -jar $gaTK/GenomeAnalysisTK.jar \
-T ClipReads \
-I ${runIdentifier}.dcs.filt.readgroups.realign.bam \
-o ${runIdentifier}.dcs.filt.readgroups.clipped.bam \
-R $alignRef \
--cyclesToTrim "1-4,117-137" \
--clipRepresentation SOFTCLIP_BASES
# 137-4-21 = 112
# Generating stats from final file
# -B (--no-BAQ) Disable per-Base Alignment Quality... Qualities exist as ASCII J
# -A (--count-orphans) Do not discard anmalous read pairs
# -d 500000 (--max-depth INT)
samtools mpileup \
-B \
-A \
-d 500000 \
-f $alignRef \
${runIdentifier}.dcs.filt.readgroups.clipped.bam \
| tee ${runIdentifier}.dcs.pileup \
| python ${DSpath}/CountMuts.py \
-d $minDepth \
-c $minClonality \
-C $maxClonality \
| tee ${runIdentifier}.dcs.aln.sort.bam.d${minDepth}-c${minClonality}-${maxClonality}.unique.countmuts
cat ${runIdentifier}.dcs.final.bam.pileup \
| python ${DSpath}/CountMuts.py \
-d $minDepth \
-c $minClonality \
-C $maxClonality \
> ${runIdentifier}.dcs.unique.countmuts
python ${DSpath}/mut-position.py \
-i ${runIdentifier}.dcs.final.bam.pileup \
-o ${runIdentifier}.dcs.aln.sort.bam.d${minDepth}-c${minClonality}-${maxClonality}.mutpos \
-d $minDepth \
-c $minClonality \
-C $maxClonality
echo "Finished post-processing for: " $runIdentifier \
| tee -a ${logFile}
date | tee -a ${logFile}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment