Skip to content

Instantly share code, notes, and snippets.

@pgsin
Last active August 16, 2019 22:11
Show Gist options
  • Save pgsin/15a7adb6e6346f258212521e374da1bf to your computer and use it in GitHub Desktop.
Save pgsin/15a7adb6e6346f258212521e374da1bf to your computer and use it in GitHub Desktop.
import os
import re
GENOME = 'data/22'
rule all:
input: expand('output/reports/{SAMPLE}', SAMPLE=[os.path.splitext(i)[0] for i in os.listdir('input/') if not i.startswith('.') and i.endswith(".bam")])
output: touch('output/done')
rule reference_index:
input: GENOME+'.fa'
output: GENOME+'.fa.fai'
shell: 'samtools faidx {input}'
rule reference_dict:
input: GENOME+'.fa'
output: GENOME+'.dict'
shell: 'java -jar $PICARD CreateSequenceDictionary R={input} O={output}'
rule index_bam:
input: 'input/{SAMPLE}.bam'
output: 'input/{SAMPLE}.bam.bai'
shell: 'samtools index {input}'
rule vc_haplotypecaller:
input: file = 'input/{SAMPLE}.bam',
index = 'input/{SAMPLE}.bam.bai',
genomeFa = GENOME+'.fa',
genomeFai = GENOME+'.fa.fai',
genomeDict = GENOME+'.dict'
output: 'output/tmp/{SAMPLE}.haplotypecaller.vcf'
shell: 'java -jar $GATK -R {input.genomeFa} -T HaplotypeCaller -I {input.file} -o {output}'
rule vc_freebayes:
input: file = 'input/{SAMPLE}.bam',
index = 'input/{SAMPLE}.bam.bai',
genomeFa = GENOME+'.fa',
genomeFai = GENOME+'.fa.fai',
genomeDict = GENOME+'.dict'
output: 'output/tmp/{SAMPLE}.freebayes.vcf'
shell: 'freebayes -f {input.genomeFa} {input.file} > {output}'
rule vc_samtools:
input: file = 'input/{SAMPLE}.bam',
index = 'input/{SAMPLE}.bam.bai',
genomeFa = GENOME+'.fa',
genomeFai = GENOME+'.fa.fai',
genomeDict = GENOME+'.dict'
output: 'output/tmp/{SAMPLE}.samtools.vcf'
shell: 'samtools mpileup -uf {input.genomeFa} {input.file} | bcftools view -vcg - > {output}'
rule index_vcf:
input: 'output/tmp/{SAMPLE}.{METHOD}.vcf'
output: 'output/tmp/{SAMPLE}.{METHOD}.vcf.gz'
shell: '''
bgzip {input}
tabix -p vcf {output}
'''
rule intersect_vcf:
input: file1 = 'output/tmp/{SAMPLE}.{METHOD1}.vcf.gz',
file2 = 'output/tmp/{SAMPLE}.{METHOD2}.vcf.gz'
output: 'output/tmp/{SAMPLE}.{METHOD1}.{METHOD2}.vcf.gz'
shell: 'vcf-isec -f -n +2 {input.file1} {input.file2} | bgzip -c > {output}'
rule count_variants:
input: 'output/tmp/{COMPARISON}.vcf.gz'
output: 'output/tmp/{COMPARISON}.count'
shell: 'vcftools --gzvcf {input} 2> {output}'
rule merge:
input: vc1 = 'output/tmp/{SAMPLE}.freebayes.count',
vc12 = 'output/tmp/{SAMPLE}.freebayes.haplotypecaller.count',
vc13 = 'output/tmp/{SAMPLE}.freebayes.samtools.count',
vc2 = 'output/tmp/{SAMPLE}.haplotypecaller.count',
vc23 = 'output/tmp/{SAMPLE}.haplotypecaller.samtools.count',
vc3 = 'output/tmp/{SAMPLE}.samtools.count'
output: 'output/reports/{SAMPLE}'
run:
files = [input.vc1, input.vc12, input.vc13, input.vc2, input.vc23, input.vc3]
a = [re.search(r'\d+', open(f).readlines()[-2]).group() for f in files]
with open(output[0], 'w') as fs:
fs.write('{}\t{}\t{}\n'.format(a[0], a[1], a[2]))
fs.write('{}\t{}\t{}\n'.format(a[3], 'None', a[4]))
fs.write('{}\t{}\t{}\n'.format(a[5], 'None', 'None'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment