Last active
August 16, 2019 22:11
-
-
Save pgsin/15a7adb6e6346f258212521e374da1bf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
GENOME = 'data/22' | |
rule all: | |
input: expand('output/reports/{SAMPLE}', SAMPLE=[os.path.splitext(i)[0] for i in os.listdir('input/') if not i.startswith('.') and i.endswith(".bam")]) | |
output: touch('output/done') | |
rule reference_index: | |
input: GENOME+'.fa' | |
output: GENOME+'.fa.fai' | |
shell: 'samtools faidx {input}' | |
rule reference_dict: | |
input: GENOME+'.fa' | |
output: GENOME+'.dict' | |
shell: 'java -jar $PICARD CreateSequenceDictionary R={input} O={output}' | |
rule index_bam: | |
input: 'input/{SAMPLE}.bam' | |
output: 'input/{SAMPLE}.bam.bai' | |
shell: 'samtools index {input}' | |
rule vc_haplotypecaller: | |
input: file = 'input/{SAMPLE}.bam', | |
index = 'input/{SAMPLE}.bam.bai', | |
genomeFa = GENOME+'.fa', | |
genomeFai = GENOME+'.fa.fai', | |
genomeDict = GENOME+'.dict' | |
output: 'output/tmp/{SAMPLE}.haplotypecaller.vcf' | |
shell: 'java -jar $GATK -R {input.genomeFa} -T HaplotypeCaller -I {input.file} -o {output}' | |
rule vc_freebayes: | |
input: file = 'input/{SAMPLE}.bam', | |
index = 'input/{SAMPLE}.bam.bai', | |
genomeFa = GENOME+'.fa', | |
genomeFai = GENOME+'.fa.fai', | |
genomeDict = GENOME+'.dict' | |
output: 'output/tmp/{SAMPLE}.freebayes.vcf' | |
shell: 'freebayes -f {input.genomeFa} {input.file} > {output}' | |
rule vc_samtools: | |
input: file = 'input/{SAMPLE}.bam', | |
index = 'input/{SAMPLE}.bam.bai', | |
genomeFa = GENOME+'.fa', | |
genomeFai = GENOME+'.fa.fai', | |
genomeDict = GENOME+'.dict' | |
output: 'output/tmp/{SAMPLE}.samtools.vcf' | |
shell: 'samtools mpileup -uf {input.genomeFa} {input.file} | bcftools view -vcg - > {output}' | |
rule index_vcf: | |
input: 'output/tmp/{SAMPLE}.{METHOD}.vcf' | |
output: 'output/tmp/{SAMPLE}.{METHOD}.vcf.gz' | |
shell: ''' | |
bgzip {input} | |
tabix -p vcf {output} | |
''' | |
rule intersect_vcf: | |
input: file1 = 'output/tmp/{SAMPLE}.{METHOD1}.vcf.gz', | |
file2 = 'output/tmp/{SAMPLE}.{METHOD2}.vcf.gz' | |
output: 'output/tmp/{SAMPLE}.{METHOD1}.{METHOD2}.vcf.gz' | |
shell: 'vcf-isec -f -n +2 {input.file1} {input.file2} | bgzip -c > {output}' | |
rule count_variants: | |
input: 'output/tmp/{COMPARISON}.vcf.gz' | |
output: 'output/tmp/{COMPARISON}.count' | |
shell: 'vcftools --gzvcf {input} 2> {output}' | |
rule merge: | |
input: vc1 = 'output/tmp/{SAMPLE}.freebayes.count', | |
vc12 = 'output/tmp/{SAMPLE}.freebayes.haplotypecaller.count', | |
vc13 = 'output/tmp/{SAMPLE}.freebayes.samtools.count', | |
vc2 = 'output/tmp/{SAMPLE}.haplotypecaller.count', | |
vc23 = 'output/tmp/{SAMPLE}.haplotypecaller.samtools.count', | |
vc3 = 'output/tmp/{SAMPLE}.samtools.count' | |
output: 'output/reports/{SAMPLE}' | |
run: | |
files = [input.vc1, input.vc12, input.vc13, input.vc2, input.vc23, input.vc3] | |
a = [re.search(r'\d+', open(f).readlines()[-2]).group() for f in files] | |
with open(output[0], 'w') as fs: | |
fs.write('{}\t{}\t{}\n'.format(a[0], a[1], a[2])) | |
fs.write('{}\t{}\t{}\n'.format(a[3], 'None', a[4])) | |
fs.write('{}\t{}\t{}\n'.format(a[5], 'None', 'None')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment