'''
Snakemake workflow for variant calling.
---------------------------------------
An example run of this workflow:
    snakemake --profile slurm --jobs 20
'''
# Directories ------------------------------------------------------------------
configfile: "config.yaml"

# Set the names of all directories
dir_list = ["LOG_DIR", "BENCHMARK_DIR", "QC_DIR", "TRIM_DIR", "ALIGN_DIR", "MARKDUP_DIR", "CALLING_DIR", "ANNOT_DIR"]
dir_names = ["LOGS", "BENCHMARKS", "QC", "TRIMMING", "ALIGNMENT", "MARK_DUPLICATES", "VARIANT_CALLING", "ANNOTATION"]
dirs_dict = dict(zip(dir_list, dir_names))
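
# For reference, the config keys used below are QC_TOOL, TRIM_TOOL, ALIGN_TOOL,
# CALLING_TOOL, ANNOT_TOOL, REF_GENOME and GENOME_ANNOTATION. A minimal config.yaml
# could look like the sketch below; the values are illustrative placeholders, not
# taken from the actual config:
#
#   QC_TOOL: "fastqc"
#   TRIM_TOOL: "trimmomatic"
#   ALIGN_TOOL: "bwa"
#   CALLING_TOOL: "varscan"
#   ANNOT_TOOL: "coovar"
#   REF_GENOME: "/path/to/reference_genome.fa"
#   GENOME_ANNOTATION: "/path/to/annotation.gff3"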

# Load Samples ----------------------------------------------------------------
import pandas as pd

# get the sample information (names, path to R1 & R2) from samples.txt
samples_information = pd.read_csv("samples.txt", sep='\t', index_col=False)

# get lists of the sample names and their FASTQ locations
sample_names = list(samples_information['sample'])
sample_locations = list(samples_information['location'])
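
# samples.txt is expected to be tab-separated with (at least) a 'sample' and a
# 'location' column, where 'location' is the directory holding that sample's
# {sample}_R1.fastq.gz / {sample}_R2.fastq.gz files. Illustrative sketch
# (placeholder values):
#
#   sample    location
#   470       /path/to/fastq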

# Rules -----------------------------------------------------------------------
rule all:
    '''
    The target rule for the workflow.
    This lists all of the final products of the workflow.
    '''
    input:
        expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R1_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R1_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R2_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R2_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R1_trim_paired_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R1_trim_paired_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R2_trim_paired_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R2_trim_paired_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/after_align/{sample}.sorted.dedupped_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/after_align/{sample}.sorted.dedupped_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_unpaired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_unpaired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam.bai', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam.bai', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.txt', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
        expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.vcf', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names),
        expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.tsv', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/categorized-gvs.gvf', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts.gff3', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/distribution_deletions.out', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.del.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/distribution_insertions.out', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.ins.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/categorized_snp_coords.list', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/contigs.summary', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/snps_frequency.table', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/sorted_gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/splice_junctions.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/target_cdna_snps.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/target_cdna_snps.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/transcripts.gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/transcripts_snps_applied.gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/variant.summary', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/ambiguous_snps.list', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/categorized_snps.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/codon_bias.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/kept_{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/kept_{sample}_sorted_dedupped_snp_varscan.tsv.snp.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/exons.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_cdna.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_cdna.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_peptides.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_variant.alignment', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_cdna.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_cdna.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_peptides.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names)

rule qc_before_trim:
    input:
        r1 = expand('{FASTQ_DIR}/{sample}_R1.fastq.gz', zip, FASTQ_DIR=sample_locations, sample=sample_names),
        r2 = expand('{FASTQ_DIR}/{sample}_R2.fastq.gz', zip, FASTQ_DIR=sample_locations, sample=sample_names)
    output:
        expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R1_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R1_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R2_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R2_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-{QC_TOOL}-qc-before-trim.log', QC_TOOL=config["QC_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    resources:
        mem = 1000,
        time = 30
    threads: 1
    params:
        dir = expand('{QC_DIR}/{QC_TOOL}/before_trim/', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"])
    message: """--- Quality check of raw data with FastQC before trimming."""
    shell:
        'module load fastqc/0.11.5; '
        'fastqc -o {params.dir} -f fastq {input.r1} {input.r2}'
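
# NOTE: every rule in this file expand()s over all samples, so each step runs as a
# single job covering the whole sample set. The more common Snakemake pattern keeps
# expand() only in `rule all` and uses a {sample} wildcard in the per-step rules, so
# each sample becomes its own job. A rough, commented-out sketch of this rule in
# that style (assuming QC_TOOL is "fastqc" and one FASTQ location per sample):
#
#   sample_locs = dict(zip(sample_names, sample_locations))
#
#   rule qc_before_trim:
#       input:
#           r1 = lambda wc: f"{sample_locs[wc.sample]}/{wc.sample}_R1.fastq.gz",
#           r2 = lambda wc: f"{sample_locs[wc.sample]}/{wc.sample}_R2.fastq.gz"
#       output:
#           "QC/fastqc/before_trim/{sample}_R1_fastqc.html",
#           "QC/fastqc/before_trim/{sample}_R2_fastqc.html"
#       shell:
#           "module load fastqc/0.11.5; "
#           "fastqc -o QC/fastqc/before_trim/ -f fastq {input.r1} {input.r2}"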

rule trimming:
    input:
        r1 = expand('{FASTQ_DIR}/{sample}_R1.fastq.gz', zip, FASTQ_DIR=sample_locations, sample=sample_names),
        r2 = expand('{FASTQ_DIR}/{sample}_R2.fastq.gz', zip, FASTQ_DIR=sample_locations, sample=sample_names)
    output:
        r1_paired = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        r1_unpaired = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_unpaired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        r2_paired = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        r2_unpaired = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_unpaired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-{TRIM_TOOL}-trimming.log', LOG_DIR=dirs_dict["LOG_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names)
    resources:
        mem = 1000,
        time = 120
    message: """--- Trimming FASTQ files with Trimmomatic."""
    shell: """
        module load trimmomatic/0.36;
        java -jar $EBROOTTRIMMOMATIC/trimmomatic-0.36.jar PE -phred33 \
        {input.r1} \
        {input.r2} \
        {output.r1_paired} \
        {output.r1_unpaired} \
        {output.r2_paired} \
        {output.r2_unpaired} \
        ILLUMINACLIP:adapters.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36;
        """

rule qc_before_alignment:
    input:
        r1 = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        r2 = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names)
    output:
        expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R1_trim_paired_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R1_trim_paired_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R2_trim_paired_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R2_trim_paired_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-{QC_TOOL}-qc-before-align.log', QC_TOOL=config["QC_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    resources:
        mem = 1000,
        time = 30
    threads: 1
    params:
        dir = expand('{QC_DIR}/{QC_TOOL}/before_align/', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"])
    message: """--- Quality check of trimmed reads with FastQC before alignment."""
    shell:
        'module load fastqc/0.11.5; '
        'fastqc -o {params.dir} -f fastq {input.r1} {input.r2}'

rule alignment:
    input:
        r1 = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        r2 = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
        ref = config["REF_GENOME"]
    output:
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-{ALIGN_TOOL}.log', ALIGN_TOOL=config["ALIGN_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    message: """--- Alignment with BWA."""
    threads: 8
    resources:
        mem = 10000,
        time = 720
    params:
        rg = expand(r'@RG\tID:{sample}\tSM:{sample}', sample=sample_names)
    shell: """
        module load bwa;
        module load samtools;
        bwa mem -t {threads} -R '{params.rg}' \
        {input.ref} {input.r1} {input.r2} |\
        samtools view -Sb - > {output}
        """

rule sort_bam:
    input:
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    output:
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-bamSort.log', LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    resources:
        mem = 2000,
        time = 90
    threads: 2
    message: """--- Sort BAM files."""
    shell: """
        module load samtools;
        samtools sort {input} -o {output}
        """

rule index_bam:
    input:
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    output:
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam.bai', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-bamIndex.log', LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    resources:
        mem = 2000,
        time = 720
    message: """--- Index BAM files."""
    shell: """
        module load samtools;
        samtools index {input} {output}
        """

rule mark_duplicates:
    input: expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    output:
        dedup = expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
        text = expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.txt', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-markDuplicates.log', LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    resources:
        mem = 15000,
        time = 720
    message: """--- Mark duplicates."""
    threads: 1
    shell: """
        module load picard/2.17.3;
        java -Xmx6G -jar $EBROOTPICARD/picard.jar MarkDuplicates \
        VALIDATION_STRINGENCY=LENIENT READ_NAME_REGEX=null \
        I={input} \
        O={output.dedup} \
        M={output.text}
        """

rule index_dedupped_bam:
    input: expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    output: expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam.bai', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-indexMarkDups.log', LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    resources:
        mem = 2000,
        time = 30
    message: """--- Index sorted, dedupped BAM files."""
    shell: """
        module load samtools;
        samtools index {input} {output}
        """

rule qc_after_alignment:
    input:
        expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    output:
        expand('{QC_DIR}/{QC_TOOL}/after_align/{sample}.sorted.dedupped_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
        expand('{QC_DIR}/{QC_TOOL}/after_align/{sample}.sorted.dedupped_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-{QC_TOOL}-qc-after-align.log', QC_TOOL=config["QC_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    resources:
        mem = 1000,
        time = 30
    threads: 1
    params:
        dir = expand('{QC_DIR}/{QC_TOOL}/after_align/', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"])
    message: """--- Quality check of aligned, dedupped reads with FastQC."""
    shell:
        """
        module load fastqc/0.11.5
        fastqc -o {params.dir} -f bam {input}
        """

rule variant_calling:
    input:
        ref = config["REF_GENOME"],
        bam = expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
    output:
        expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.vcf', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-{CALLING_TOOL}.log', CALLING_TOOL=config["CALLING_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    resources:
        mem = 5000,
        time = 720
    message: """--- VarScan pileup2snp."""
    shell: """
        module load samtools;
        module load java/13.0.1;
        # run mpileup | pileup2snp per chromosome, up to 7 jobs in parallel
        echo -n "I II III IV V X MtDNA" |\
        xargs -d " " -n 1 -P 7 -I {{}} /bin/bash -c \
        "samtools mpileup -r {{}} \
        -f ~/projects/def-mtarailo/common/indexes/WS265_wormbase/{{}}.fa {input.bam} |\
        java -Xmx5G -jar ~/projects/def-mtarailo/common/tools/VarScan.v2.3.9.jar pileup2snp \
        --variants \
        --min-coverage 5 \
        --min-avg-qual 30 \
        --min-var-freq 0.9 > {{}}.vcf"
        # concatenate the per-chromosome outputs, keeping the header line from the first file only
        awk '
        FNR==1 && NR!=1 {{ while (/^Chrom/) getline; }}
        1 {{print}}
        ' *.vcf > {output}
        rm I.vcf II.vcf III.vcf IV.vcf V.vcf X.vcf MtDNA.vcf
        """

rule prep_gff3:
    input: expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.vcf', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names)
    output: expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.tsv', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-{CALLING_TOOL}-prepGff3.log', CALLING_TOOL=config["CALLING_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    resources:
        mem = 2000,
        time = 30
    message: """--- Prepping VCF -> TSV for CooVar."""
    shell: """
        sed 1d {input} |\
        awk '{{print "SNP\\t"$1"\\t"$2"\\t"$3"\\t"$4}}' > {output}
        """

rule variant_annotation:
    input:
        tsv = expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.tsv', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names),
        ref = config["REF_GENOME"],
        gff3 = config["GENOME_ANNOTATION"]
    output:
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/categorized-gvs.gvf', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts.gff3', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/distribution_deletions.out', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.del.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/distribution_insertions.out', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.ins.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/categorized_snp_coords.list', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/contigs.summary', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/snps_frequency.table', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/sorted_gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/splice_junctions.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/target_cdna_snps.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/target_cdna_snps.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/transcripts.gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/transcripts_snps_applied.gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/variant.summary', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/ambiguous_snps.list', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/categorized_snps.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/codon_bias.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/kept_{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/kept_{sample}_sorted_dedupped_snp_varscan.tsv.snp.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/exons.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_cdna.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_cdna.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_peptides.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_variant.alignment', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_cdna.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_cdna.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_peptides.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
        expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names)
    params:
        prefix = expand("{ANNOT_DIR}/{ANNOT_TOOL}/{sample}", ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names)
    log: expand('{LOG_DIR}/{sample}-{ANNOT_TOOL}.log', ANNOT_TOOL=config["ANNOT_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
    message: """--- Annotating variants with CooVar."""
    resources:
        mem = 12000,
        time = 1440
    threads: 6
    shell: """
        module load bioperl;
        perl ~/projects/def-mtarailo/common/tools/CooVar/coovar.pl \
        --cores {threads} \
        -e {input.gff3} \
        -r {input.ref} \
        -t {input.tsv} \
        -o {params.prefix} \
        --circos
        """