Last active
March 24, 2022 03:58
-
-
Save philippmuench/9c38b8feb6f18b3ef048c78c1dd15131 to your computer and use it in GitHub Desktop.
pipeline OligoMM-Antibiotics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
oligomm_antibiotics.snakefile | |
Philipp Muench | |
Maps NGS data to OligoMM reference genomes and performs variant calling | |
----------------------------------------------------------------------- | |
Requirements: | |
samtools | |
http://www.htslib.org/download/ | |
fastp | |
https://github.com/OpenGene/fastp | |
kneaddata | |
https://bitbucket.org/biobakery/kneaddata | |
Trimmomatic | |
http://www.usadellab.org/cms/?page=trimmomatic | |
bowtie2 | |
http://bowtie-bio.sourceforge.net/bowtie2/index.shtml | |
Rscript | |
https://cran.r-project.org/mirrors.html | |
sankeyD3 (R package) | |
https://github.com/fbreitwieser/sankeyD3 | |
networkD3 (R package) | |
https://cran.r-project.org/web/packages/networkD3/index.html | |
bowtie2 | |
http://bowtie-bio.sourceforge.net/bowtie2/ | |
Usage: | |
snakemake --jobs 10 | |
''' | |
# Globals --------------------------------------------------------------------- | |
# list of genomes to be processed (contig names of reference sequence) | |
GENOMES = ["CP022722.1", "CP021420.1", "CP021421.1", "CP021422.1", "CP022712.1", "CP022713.1"] | |
# name of sample to be processed | |
SAMPLES = ["1683d00_S47"] | |
# process many sample at once | |
FILES, = glob_wildcards("../ftp_reads/genome.gbf.de/HZI_BIFO/bifouser/19-0125/{files}_R1_001.fastq.gz") | |
# joned reference to map against | |
REF="/net/metagenomics/projects/pmuench_oligomm_ab/reference_genomes/joined/joined_reference.fasta" | |
# dependencies | |
BOWTIE2_DIR="/net/metagenomics/projects/pmuench_oligomm_ab/tools/bowtie2-2.3.5.1-linux-x86_64" | |
PICARD_PATH="/net/metagenomics/projects/pmuench_oligomm_ab/tools/picard.jar" | |
TRIMMOMATIC_DIR="/net/metagenomics/projects/pmuench_oligomm_ab/tools/Trimmomatic-0.39" | |
# DBs | |
KNEADDATA_DB="/net/metagenomics/projects/pmuench_oligomm_ab/kneadData_ref_genomes" | |
KRAKEN_DB="/net/metagenomics/projects/pmuench_oligomm_ab/tools/kraken2/kraken2-2.0.7-beta/db/minikraken2_v2_8GB_201904_UPDATE" | |
# Rules ----------------------------------------------------------------------- | |
rule all: | |
input: | |
# expand("snpEff/{sample}/{genome}.vcf", genome=GENOMES, sample=SAMPLES), | |
# expand("igv_report/{sample}/{genome}.html", genome=GENOMES, sample=SAMPLES) | |
expand("kraken/{sample}/{sample}.report", sample=FILES) | |
rule copy_files: | |
input: | |
from1="../ftp_reads/genome.gbf.de/HZI_BIFO/bifouser/19-0125/{sample}_R1_001.fastq.gz", | |
from2="../ftp_reads/genome.gbf.de/HZI_BIFO/bifouser/19-0125/{sample}_R2_001.fastq.gz" | |
output: | |
to1="reads/pe/{sample}_R1_001.fastq.gz", | |
to2="reads/pe/{sample}_R2_001.fastq.gz" | |
shell: """ | |
cp {input.from1} {output.to1} | |
cp {input.from2} {output.to2} | |
""" | |
# QC using fastp (https://github.com/OpenGene/fastp), pipeline will use kneaddata | |
rule fastp_pe: | |
input: | |
sample=["reads/pe/{sample}_R1_001.fastq.gz", "reads/pe/{sample}_R2_001.fastq.gz"] | |
output: | |
trimmed=["trimmed/pe/{sample}.1.fastq.gz", "trimmed/pe/{sample}.2.fastq.gz"], | |
html="report/pe/{sample}.html", | |
json="report/pe/{sample}.json" | |
log: | |
"logs/fastp/pe/{sample}.log" | |
params: | |
extra="" | |
threads: 25 | |
wrapper: | |
"0.36.0/bio/fastp" | |
# QC using kneaddata (https://bitbucket.org/biobakery/kneaddata) | |
rule kneaddata: | |
input: | |
fq1=["reads/pe/{sample}_R1_001.fastq.gz"], | |
fq2=["reads/pe/{sample}_R2_001.fastq.gz"] | |
output: | |
un1=["kneaddata/{sample}/{sample}_R1_001_kneaddata_unmatched_1.fastq"], | |
un2=["kneaddata/{sample}/{sample}_R1_001_kneaddata_unmatched_2.fastq"], | |
trimmed1=["kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_1.fastq"], | |
trimmed2=["kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_2.fastq"] | |
threads: 10 | |
params: | |
folder=["kneaddata/{sample}"], | |
trim=TRIMMOMATIC_DIR, | |
db=KNEADDATA_DB, | |
bowtie2=BOWTIE2_DIR | |
log: | |
"logs/kneaddata/{sample}.log" | |
shell: """ | |
kneaddata --input {input.fq1} --remove-intermediate-output --input {input.fq2} -db {params.db} --output {params.folder} --trimmomatic {params.trim} --bowtie2 {params.bowtie2} -t {threads} > {log} 2> {log} | |
""" | |
# align PE reads to joined reference | |
rule bwa_mem: | |
input: | |
# reads=["kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_1.fastq", "kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_2.fastq"] | |
reads=["trimmed/pe/{sample}.1.fastq.gz", "trimmed/pe/{sample}.2.fastq.gz"] | |
output: | |
"mapped/{sample}.bam" | |
log: | |
"logs/bwa_mem/{sample}.log" | |
params: | |
index=REF, | |
extra=r"-R '@RG\tID:{sample}\tSM:{sample}'", | |
sort="none", # Can be 'none', 'samtools' or 'picard'. | |
sort_order="queryname", # Can be 'queryname' or 'coordinate'. | |
sort_extra="" # Extra args for samtools/picard. | |
threads: 10 | |
wrapper: | |
"0.36.0/bio/bwa/mem" | |
# extract unmapped reads | |
rule extract_unmapped_reads: | |
input: | |
bam=["mapped/{sample}.bam"] | |
output: | |
unmapped=["unmapped/{sample}.bam"], | |
sorted=["unmapped/{sample}_sorted.bam"] | |
shell: """ | |
samtools view -b -f 4 {input.bam} > {output.unmapped} | |
samtools sort {output.unmapped} -o {output.sorted} | |
""" | |
# bam to fast to get unmapped PE reads | |
rule bam_to_fastq: | |
input: | |
bam=["unmapped/{sample}_sorted.bam"] | |
output: | |
fq1=["unmapped/fastq/{sample}_1.fastq"], | |
fq2=["unmapped/fastq/{sample}_2.fastq"] | |
shell: """ | |
bedtools bamtofastq -i {input.bam} -fq {output.fq1} -fq2 {output.fq2} | |
""" | |
# assign taxon to non-mapping reads based on kneaddata output | |
rule kraken2: | |
input: | |
reads1=["unmapped/fastq/{sample}_1.fastq"], | |
reads2=["unmapped/fastq/{sample}_2.fastq"] | |
output: | |
krak=["kraken/{sample}/{sample}.krak"], | |
report=["kraken/{sample}/{sample}.report"] | |
params: | |
db=KRAKEN_DB | |
threads: 8 | |
shell: """ | |
kraken2 --paired --use-names --db {params.db} --threads 20 --output {output.krak} --report {output.report} {input.reads1} {input.reads2} | |
""" | |
# generate html kraken report | |
rule kraken2Sankey: | |
input: | |
report=["kraken/{sample}/{sample}.report"] | |
output: | |
html=["kraken/{sample}/{sample}.html"] | |
shell: """ | |
Rscript scripts/krakenSankey.R {input.report} | |
""" | |
rule samtools_flagstat: | |
input: | |
"mapped/{sample}.bam" | |
output: | |
"mapped/{sample}.bam.flagstat" | |
wrapper: | |
"0.36.0/bio/samtools/flagstat" | |
rule samtools_sort: | |
input: | |
"mapped/{sample}.bam" | |
output: | |
"mapped/{sample}.sorted.bam" | |
threads: # Samtools takes additional threads through its option -@ | |
25 # This value - 1 will be sent to -@. | |
wrapper: | |
"0.36.0/bio/samtools/sort" | |
rule mark_duplicates: | |
input: | |
bam="mapped/{sample}.sorted.bam" | |
output: | |
bam="dedup/{sample}.bam" | |
params: | |
jar=PICARD_PATH | |
log: | |
"logs/picard/dedup/{sample}.log" | |
shell: "java -Xmx10G -jar {params.jar} MarkDuplicates INPUT={input.bam} OUTPUT={output.bam} CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT QUIET=true AS=true METRICS_FILE=metrics VERBOSITY=WARNING TMP_DIR=/scratch/pmuench" | |
rule samtools_index: | |
input: | |
"dedup/{sample}.bam" | |
output: | |
"dedup/{sample}.bam.bai" | |
params: | |
"" # optional params string | |
wrapper: | |
"0.36.0/bio/samtools/index" | |
rule samtools_index_raw: | |
input: | |
"mapped/{sample}.sorted.bam" | |
output: | |
"mapped/{sample}.sorted.bam.bai" | |
params: | |
"" # optional params string | |
wrapper: | |
"0.36.0/bio/samtools/index" | |
# calculate coverage on deduplicated bam files | |
rule samtools_depth_dedup: | |
input: | |
bam="dedup/{sample}.bam" | |
output: | |
stat="coverage/{sample}/{sample}_dedup.coverage.txt.gz" | |
shell: "samtools depth {input.bam} | bgzip > {output.stat}" | |
# calculate coverage on non-deduplicated bam files | |
rule samtools_depth: | |
input: | |
bam="dedup/{sample}.bam" | |
output: | |
stat="coverage/{sample}/{sample}.coverage.txt.gz" | |
shell: "samtools depth {input.bam} | bgzip > {output.stat}" | |
# plot depth on dedup coverage information | |
#rule depth_plot: | |
# input: | |
# stat="dedup/{sample}.coverage.txt.gz" | |
# output: | |
# pdf="coverage_statistics/{sample}.coverage.txt.gz.pdf" | |
# script: "scripts/coverage.R" | |
rule lofreq: | |
input: | |
bam="mapped/{sample}.sorted.bam", | |
bai="mapped/{sample}.sorted.bam.bai", | |
ref=REF | |
output: | |
vcf="lofreq_call/{sample}.vcf" | |
log: | |
"logs/lofreq_call/{sample}.log" | |
threads: 20 | |
shell: "lofreq call-parallel --pp-threads {threads} -f {input.ref} {input.bam} -o {output.vcf} > {log} 2> {log}" | |
rule bgzip: | |
input: | |
vcf="lofreq_call/{sample}.vcf" | |
output: | |
vcfgz="lofreq_call/{sample}.vcf.gz" | |
log: | |
"logs/bgzip/{sample}.log" | |
shell: | |
"bgzip -c {input.vcf} > {output.vcfgz} 2> {log}" | |
rule index_vcf: | |
input: | |
vcfgz="lofreq_call/{sample}.vcf.gz" | |
output: | |
vcfsort="lofreq_call/{sample}.vcf.gz.tbi" | |
log: | |
"logs/sort_vcf/{sample}.log" | |
shell: | |
"tabix -f -p vcf {input.vcfgz}" | |
rule get_chr_list: | |
input: | |
vcf="lofreq_call/{sample}.vcf" | |
output: | |
chr="lofreq_call/{sample}.txt" | |
shell: """cat {input.vcf} | mawk '$1 ~ /^#/ {{next}} {{print $1 | "sort -k1,1 -u"}}' > {output.chr}""" | |
""" | |
sorts and gz a vcf file and indexes | |
""" | |
rule sort_vcf: | |
input: | |
vcf="lofreq_call/{sample}.vcf" | |
output: | |
sorted="lofreq_call/{sample}_sorted.vcf.gz" | |
shell: """ | |
cat {input.vcf} | mawk '$1 ~ /^#/ {{print $0;next}} {{print $0 | "sort -k1,1 -k2,2n"}}' | bgzip -c > {output.sorted} | |
tabix -p vcf {output.sorted} | |
""" | |
""" | |
extracts a genomes from a sorted gz vcfDD | |
""" | |
rule split_vcf: | |
input: | |
vcf="lofreq_call/{sample}_sorted.vcf.gz" | |
output: | |
single="lofreq_splitted/{sample}/{genome}.vcf", | |
sorted="lofreq_splitted/{sample}/{genome}_sorted.vcf.gz", | |
index="lofreq_splitted/{sample}/{genome}.vcf.tbi" | |
shell: """ | |
tabix -h {input.vcf} {wildcards.genome} > {output.single} | |
cat {output.single} | mawk '$1 ~ /^#/ {{print $0;next}} {{print $0 | "sort -k1,1 -k2,2n"}}' | bgzip -c > {output.sorted} | |
tabix -f -p vcf {output.sorted} | |
cp {output.sorted}.tbi {output.index} | |
""" | |
rule split_bam: | |
input: | |
bam="dedup/{sample}.bam" | |
output: | |
single="dedup_splitted/{sample}/{genome}.bam" | |
shell: """ | |
samtools view -bh {input.bam} > {output.single} | |
samtools index {output.single} | |
""" | |
rule annotate_vcf: | |
input: | |
vcf="lofreq_splitted/{sample}/{genome}_sorted.vcf.gz", | |
bed="reference_genomes/{genome}.bed.gz" | |
output: | |
vcf2="lofreq_annotated/{sample}/{genome}.vcf" | |
shell: """ | |
bcftools annotate -a {input.bed} -c CHROM,FROM,TO,GENE -h <(echo '##INFO=<ID=GENE,Number=1,Type=String,Description="Gene name">') {input.vcf} > {output.vcf2} | |
""" | |
rule snpEff: | |
input: | |
vcf="lofreq_splitted/{sample}/{genome}.vcf" | |
params: | |
ref="{genome}" | |
output: | |
vcf="snpEff/{sample}/{genome}.vcf", | |
bed="snpEff/{sample}/{genome}.bed", | |
csv="snpEff/{sample}/{genome}.csv", | |
html="snpEff/{sample}/{genome}.html", | |
log: "logs/smpEff/{sample}/{genome}.log" | |
shell: """ | |
java -Xmx4g -jar ../tools/snpEff/snpEff.jar -no-downstream -no-upstream {params.ref} {input.vcf} -o bed > {output.bed} | |
java -Xmx4g -jar ../tools/snpEff/snpEff.jar -no-downstream -no-upstream {params.ref} {input.vcf} -csvStats {output.csv} > {output.vcf} 2> {log} | |
java -Xmx4g -jar ../tools/snpEff/snpEff.jar -no-downstream -no-upstream {params.ref} {input.vcf} -stats {output.html} > {output.vcf} 2> {log} | |
""" | |
rule igv_report: | |
input: | |
fasta="reference_genomes/{genome}.fasta", | |
vcf="snpEff/{sample}/{genome}.vcf", | |
bam="dedup_splitted/{sample}/{genome}.bam", | |
bed="reference_genomes/{genome}.bed.gz" | |
output: | |
path="igv_report/{sample}/{genome}.html" | |
shell: "create_report {input.vcf} {input.fasta} --info-columns ANN --tracks {input.bam} {input.bed} --output {output.path}" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment