Skip to content

Instantly share code, notes, and snippets.

@robsyme
Created November 20, 2024 21:46
Show Gist options
  • Save robsyme/a830d29046082e16b0fb26b6055e2cb2 to your computer and use it in GitHub Desktop.
Save robsyme/a830d29046082e16b0fb26b6055e2cb2 to your computer and use it in GitHub Desktop.
Example Genotyping script using new workflow output
#!/usr/bin/env nextflow
nextflow.preview.output = true
/*
* Pipeline parameters
*/
// Primary input (file of input files, one per line)
params.reads_bam = "${projectDir}/data/sample_bams.txt"
// Output directory
params.outdir = "results_genomics"
// Accessory files
params.reference = "${projectDir}/data/ref/ref.fasta"
params.reference_index = "${projectDir}/data/ref/ref.fasta.fai"
params.reference_dict = "${projectDir}/data/ref/ref.dict"
params.intervals = "${projectDir}/data/ref/intervals.bed"
params.cohort_name = "family_trio"
/*
* Generate BAM index file
*/
process SAMTOOLS_INDEX {
container 'community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464'
// publishDir params.outdir, mode: 'symlink'
input:
path input_bam
output:
tuple path(input_bam), path("${input_bam}.bai")
script:
"""
samtools index '$input_bam'
"""
}
/*
* Call variants with GATK HaplotypeCaller
*/
process GATK_HAPLOTYPECALLER {
container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
publishDir params.outdir, mode: 'symlink'
input:
tuple path(input_bam), path(input_bam_index)
path ref_fasta
path ref_index
path ref_dict
path interval_list
output:
path "*.vcf" , emit: vcf
path "*.vcf.idx" , emit: idx
script:
"""
gatk HaplotypeCaller \
-R ${ref_fasta} \
-I ${input_bam} \
-O ${input_bam}.g.vcf \
-L ${interval_list} \
-ERC GVCF
"""
}
/*
* Combine GVCFs into GenomicsDB datastore
*/
process GATK_JOINTGENOTYPING {
container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
publishDir params.outdir, mode: 'copy'
input:
path all_gvcfs
path all_idxs
path interval_list
val cohort_name
path ref_fasta
path ref_index
path ref_dict
output:
path "${cohort_name}.joint.vcf", emit: vcf
path "${cohort_name}.joint.vcf.idx", emit: idx
script:
def gvcfs_line = all_gvcfs.collect { gvcf -> "-V $gvcf" }.join(' ')
"""
gatk GenomicsDBImport \
${gvcfs_line} \
-L ${interval_list} \
--genomicsdb-workspace-path ${cohort_name}_gdb
gatk GenotypeGVCFs \
-R ${ref_fasta} \
-V gendb://${cohort_name}_gdb \
-L ${interval_list} \
-O ${cohort_name}.joint.vcf
"""
}
workflow {
main:
// Create input channel from a text file listing input file paths
reads_ch = Channel.fromPath(params.reads_bam).splitText()
// Load the file paths for the accessory files (reference and intervals)
ref_file = file(params.reference)
ref_index_file = file(params.reference_index)
ref_dict_file = file(params.reference_dict)
intervals_file = file(params.intervals)
// Create index file for input BAM file
SAMTOOLS_INDEX(reads_ch)
// Call variants from the indexed BAM file
GATK_HAPLOTYPECALLER(
SAMTOOLS_INDEX.out,
ref_file,
ref_index_file,
ref_dict_file,
intervals_file
)
// Collect variant calling outputs across samples
all_gvcfs_ch = GATK_HAPLOTYPECALLER.out.vcf
.dump(tag:"beforeCollect")
.collect()
.dump(tag:"afterCollect")
all_idxs_ch = GATK_HAPLOTYPECALLER.out.idx.collect()
// Combine GVCFs into a GenomicsDB datastore
GATK_JOINTGENOTYPING(
all_gvcfs_ch,
all_idxs_ch,
intervals_file,
params.cohort_name,
ref_file,
ref_index_file,
ref_dict_file
)
GATK_JOINTGENOTYPING.out.vcf.dump(tag: "foo")
publish:
GATK_JOINTGENOTYPING.out.vcf >> 'jointGenotyped'
}
output {
'jointGenotyped' {
path 'vcfs'
mode 'copy'
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment