lpenguin · November 8, 2017 14:46 · pgsin · Nov 8, 2017
diff --git a/config.yml b/config.yml
 bowtie2.bin: /srv/common/bin/bowtie2
 bowtie2-build.bin: /srv/common/bin/bowtie2-build
 bowtie2.additional.params: -k 1 
 bowtie2.threads: 20
 bowtie2.index: <some path to index>

 samtools.bin: /srv/common/bin/samtools
 genome_coverage.bin: /srv/common/bin/genomeCoverageBed
diff --git a/run.snakemake.sh b/run.snakemake.sh
 # --drmaa ' -pe make {threads}': how to pass required number of threads to grid engine scheduler 
 # -k: do not stop if one of jobs fail
 # -p: print exec'ed command
 # --latency-wait 120: wait for file timeout (FS syncronisation issue)
 # -j: maximum amount of simultanious jobs
 # all: - target rule name

 snakemake --drmaa-log-dir drmaa_log \
 --drmaa ' -pe make {threads}'  \
 -k \
 -p \
 --latency-wait 120  \
 -j 18 \
 all
diff --git a/Snakefile b/Snakefile
 from os.path import exists

 configfile: 'config.yml'

 BOWTIE2_BIN = config['bowtie2.bin']
 BOWTIE2_INDEX = config['bowtie2.index']
 BOWTIE2_THREADS = config['bowtie2.threads']

 SAMTOOLS_BIN = config['samtools.bin']
 GENOME_COVERAGE_BIN = config['genome_coverage.bin']

 SAMPLES, EXT = glob_wildcards('data/reads/{sample}.R1.{ext}')
 SAMPLES = list(set(SAMPLES))

 ruleorder: _bowtie2_paired_and_single > _bowtie2_paired

 def multiple_exts(template, *extensions):
    def wrapped_input(wildcards):
        for ext in extensions:
            p = template.format(**wildcards) + ext
            if exists(p):
                return p
        return p
    return wrapped_input

 # TODO: pipe intermediate results
 # # map reads and sort bam file
 # bowtie2 -x refgenome --no-unal -U ${WGS_SAMPLE} -S - -p 12 | \
 #  samtools view -bS - | \
 #  samtools sort -m 5G - mapping_result_sorted.bam

 #    -U ${WGS_SAMPLE} consider all reads as unpaired reads
 #    -S -             bowtie output in SAM format, write to standard out
 #    -x refgenome     use reference genome bowtie2-database
 #    --no-unal        suppress SAM records for reads that failed to align
 #    -p 12            use 12 processors

 localrules: _bam_sort, _bam_index, _coverage, _reads_count

 rule all:
    input: 
        cov=expand("data/coverage/{sample}.coverage.txt", sample=SAMPLES),
        counts=expand("data/count/{sample}.count.txt", sample=SAMPLES),

 rule _bowtie2_paired_and_single:
    input: 
        r1=multiple_exts("data/reads/{sample}.R1", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
        r2=multiple_exts("data/reads/{sample}.R2", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
        s=multiple_exts("data/reads/{sample}.S", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
    threads: min(20, BOWTIE2_THREADS)
    output: 
        sam=temp("data/sam/{sample}.sam"),
        log=temp("log/bowtie2/{sample}_R12S.log"),
    log: "log/bowtie2/{sample}_R12S.log"
    shell:
        "{BOWTIE2_BIN} \
        {config[bowtie2.additional.params]} \
        -x {BOWTIE2_INDEX} \
        -S {output.sam} \
        --threads {threads} \
        -1 {input.r1} -2 {input.r2} -U {input.s} \
        --no-unal \
        >{log} 2>&1 \
        "

 rule _bowtie2_paired:
    input: 
        r1=multiple_exts("data/reads/{sample}.R1", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
        r2=multiple_exts("data/reads/{sample}.R2", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
    threads: min(20, BOWTIE2_THREADS)
    output: 
        sam=temp("data/sam/{sample}.sam"),
        log=temp("log/bowtie2/{sample}_R12S.log"),
    log: "log/bowtie2/{sample}_R12S.log"
    shell:
        "{BOWTIE2_BIN} \
        {config[bowtie2.additional.params]} \
        -x {BOWTIE2_INDEX} \
        -S {output.sam} \
        --threads {threads} \
        -1 {input.r1} -2 {input.r2} \
        --no-unal \
        >{log} 2>&1 \
        "

 rule _reads_count:
    input: "log/bowtie2/{sample}_R12S.log"
    output: "data/count/{sample}.count.txt"
    shell: 
        "head {input} -n 1 | awk '{{print $1}}' > {output}"

 rule _bam_convert:
    input: "data/sam/{sample}.sam"
    output: temp("data/bam/{sample}.bam")
    threads: 5
    log: "log/bam_convert/{sample}.log"
    shell: 
        "{SAMTOOLS_BIN} view -@ {threads} -bS {input} -o {output} >{log} 2>&1"

 rule _bam_sort:
    input: "data/bam/{sample}.bam"
    output: "data/bam_sorted/{sample}.bam"
    params: 
        output="data/bam_sorted/{sample}"
    log: "log/bam_sort/{sample}.log"
    shell: 
        "{SAMTOOLS_BIN} sort {input} {params.output} >{log} 2>&1"

 rule _bam_index:
    input: "data/bam_sorted/{sample}.bam"
    output: "data/bam_sorted/{sample}.bam.bai"
    log: "log/bam_index/{sample}.log"
    shell: 
        "{SAMTOOLS_BIN} index {input} >{log} 2>&1"

 rule _coverage:
    input: 
        bam="data/bam_sorted/{sample}.bam",
        bai="data/bam_sorted/{sample}.bam.bai"
    output: "data/coverage/{sample}.coverage.txt"
    shell: 
        "{GENOME_COVERAGE_BIN} -bga -ibam {input.bam} > {output}"
	bowtie2.bin: /srv/common/bin/bowtie2
	bowtie2-build.bin: /srv/common/bin/bowtie2-build
	bowtie2.additional.params: -k 1
	bowtie2.threads: 20
	bowtie2.index: <some path to index>

	samtools.bin: /srv/common/bin/samtools
	genome_coverage.bin: /srv/common/bin/genomeCoverageBed
	# --drmaa ' -pe make {threads}': how to pass required number of threads to grid engine scheduler
	# -k: do not stop if one of jobs fail
	# -p: print exec'ed command
	# --latency-wait 120: wait for file timeout (FS syncronisation issue)
	# -j: maximum amount of simultanious jobs
	# all: - target rule name

	snakemake --drmaa-log-dir drmaa_log \
	--drmaa ' -pe make {threads}' \
	-k \
	-p \
	--latency-wait 120 \
	-j 18 \
	all
	from os.path import exists

	configfile: 'config.yml'

	BOWTIE2_BIN = config['bowtie2.bin']
	BOWTIE2_INDEX = config['bowtie2.index']
	BOWTIE2_THREADS = config['bowtie2.threads']

	SAMTOOLS_BIN = config['samtools.bin']
	GENOME_COVERAGE_BIN = config['genome_coverage.bin']

	SAMPLES, EXT = glob_wildcards('data/reads/{sample}.R1.{ext}')
	SAMPLES = list(set(SAMPLES))

	ruleorder: _bowtie2_paired_and_single > _bowtie2_paired

	def multiple_exts(template, *extensions):
	def wrapped_input(wildcards):
	for ext in extensions:
	p = template.format(**wildcards) + ext
	if exists(p):
	return p
	return p
	return wrapped_input

	# TODO: pipe intermediate results
	# # map reads and sort bam file
	# bowtie2 -x refgenome --no-unal -U ${WGS_SAMPLE} -S - -p 12 \| \
	# samtools view -bS - \| \
	# samtools sort -m 5G - mapping_result_sorted.bam

	# -U ${WGS_SAMPLE} consider all reads as unpaired reads
	# -S - bowtie output in SAM format, write to standard out
	# -x refgenome use reference genome bowtie2-database
	# --no-unal suppress SAM records for reads that failed to align
	# -p 12 use 12 processors

	localrules: _bam_sort, _bam_index, _coverage, _reads_count

	rule all:
	input:
	cov=expand("data/coverage/{sample}.coverage.txt", sample=SAMPLES),
	counts=expand("data/count/{sample}.count.txt", sample=SAMPLES),

	rule _bowtie2_paired_and_single:
	input:
	r1=multiple_exts("data/reads/{sample}.R1", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
	r2=multiple_exts("data/reads/{sample}.R2", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
	s=multiple_exts("data/reads/{sample}.S", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
	threads: min(20, BOWTIE2_THREADS)
	output:
	sam=temp("data/sam/{sample}.sam"),
	log=temp("log/bowtie2/{sample}_R12S.log"),
	log: "log/bowtie2/{sample}_R12S.log"
	shell:
	"{BOWTIE2_BIN} \
	{config[bowtie2.additional.params]} \
	-x {BOWTIE2_INDEX} \
	-S {output.sam} \
	--threads {threads} \
	-1 {input.r1} -2 {input.r2} -U {input.s} \
	--no-unal \
	>{log} 2>&1 \
	"

	rule _bowtie2_paired:
	input:
	r1=multiple_exts("data/reads/{sample}.R1", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
	r2=multiple_exts("data/reads/{sample}.R2", '.fastq', '.fastq.gz', '.fq', '.fq.gz'),
	threads: min(20, BOWTIE2_THREADS)
	output:
	sam=temp("data/sam/{sample}.sam"),
	log=temp("log/bowtie2/{sample}_R12S.log"),
	log: "log/bowtie2/{sample}_R12S.log"
	shell:
	"{BOWTIE2_BIN} \
	{config[bowtie2.additional.params]} \
	-x {BOWTIE2_INDEX} \
	-S {output.sam} \
	--threads {threads} \
	-1 {input.r1} -2 {input.r2} \
	--no-unal \
	>{log} 2>&1 \
	"

	rule _reads_count:
	input: "log/bowtie2/{sample}_R12S.log"
	output: "data/count/{sample}.count.txt"
	shell:
	"head {input} -n 1 \| awk '{{print $1}}' > {output}"

	rule _bam_convert:
	input: "data/sam/{sample}.sam"
	output: temp("data/bam/{sample}.bam")
	threads: 5
	log: "log/bam_convert/{sample}.log"
	shell:
	"{SAMTOOLS_BIN} view -@ {threads} -bS {input} -o {output} >{log} 2>&1"

	rule _bam_sort:
	input: "data/bam/{sample}.bam"
	output: "data/bam_sorted/{sample}.bam"
	params:
	output="data/bam_sorted/{sample}"
	log: "log/bam_sort/{sample}.log"
	shell:
	"{SAMTOOLS_BIN} sort {input} {params.output} >{log} 2>&1"

	rule _bam_index:
	input: "data/bam_sorted/{sample}.bam"
	output: "data/bam_sorted/{sample}.bam.bai"
	log: "log/bam_index/{sample}.log"
	shell:
	"{SAMTOOLS_BIN} index {input} >{log} 2>&1"

	rule _coverage:
	input:
	bam="data/bam_sorted/{sample}.bam",
	bai="data/bam_sorted/{sample}.bam.bai"
	output: "data/coverage/{sample}.coverage.txt"
	shell:
	"{GENOME_COVERAGE_BIN} -bga -ibam {input.bam} > {output}"