Skip to content

Instantly share code, notes, and snippets.

@clintval
Created October 23, 2025 21:34
Show Gist options
  • Select an option

  • Save clintval/5c4b1d0295f4da1195ac89ee80c59b04 to your computer and use it in GitHub Desktop.

Select an option

Save clintval/5c4b1d0295f4da1195ac89ee80c59b04 to your computer and use it in GitHub Desktop.
A shitty exome sequencing pipeline
rule all:
input:
#######################################################################
#
# Pipeline #1 Flow cell QC and Demultiplexing
#
#######################################################################
# check_illumina_directory
f'{run_output}/logs/CheckIlluminaDirectory.log',
# collect_illumina_lane_metrics
f'{run_output}/basecalling/{run_barcode}.illumina_lane_metrics.txt',
f'{run_output}/basecalling/{run_barcode}.illumina_phasing_metrics.txt',
# extract_illumina_barcodes
expand(f'{run_output}/basecalling/barcodes.{{lane}}/barcode_counts.lane-{{lane}}.metrics.txt', lane=lanes),
# collect_illumina_basecalling_metrics
expand(f'{run_output}/basecalling/basecalling.lane-{{lane}}.metrics.txt', lane=lanes),
# collect_quality_yield_metrics
expand(f'{run_output}/{{name}}/{{name}}.unmapped.quality_yield_metrics.txt', name=sample_names),
#######################################################################
#
# Pipeline #2 Alignment
#
#######################################################################
# align_with_bwa_mem, sort_sam_files, mark_duplicates
expand(f'{run_output}/{{name}}/{{name}}.mapped.bam', name=sample_names),
expand(f'{run_output}/{{name}}/{{name}}.mapped.duplicate_metrics.txt', name=sample_names),
#######################################################################
#
# Pipeline #3 QC Mapped BAM
#
#######################################################################
# collect_alignment_summary_metrics
expand(f'{run_output}/{{name}}/{{name}}.mapped.alignment_summary_metrics.txt', name=sample_names),
# collect_hs_metrics
expand(f'{run_output}/{{name}}/{{name}}.mapped.hybrid_selection_metrics.txt', name=sample_names),
expand(f'{run_output}/{{name}}/{{name}}.mapped.hybrid_selection_metrics.per_target.txt', name=sample_names),
# collect_insert_size_metrics
expand(f'{run_output}/{{name}}/{{name}}.mapped.insert_size_metrics.txt', name=sample_names),
expand(f'{run_output}/{{name}}/{{name}}.mapped.insert_size_metrics.pdf', name=sample_names),
# collect_sequencing_artifact_metrics
expand(f'{run_output}/{{name}}/{{name}}.mapped.bait_bias_detail_metrics.txt', name=sample_names),
expand(f'{run_output}/{{name}}/{{name}}.mapped.bait_bias_summary_metrics.txt', name=sample_names),
expand(f'{run_output}/{{name}}/{{name}}.mapped.error_summary_metrics.txt', name=sample_names),
expand(f'{run_output}/{{name}}/{{name}}.mapped.pre_adapter_detail_metrics.txt', name=sample_names),
expand(f'{run_output}/{{name}}/{{name}}.mapped.pre_adapter_summary_metrics.txt', name=sample_names),
# collect_base_distribution_by_cycle
expand(f'{run_output}/{{name}}/{{name}}.mapped.base_distribution_by_cycle.txt', name=sample_names),
expand(f'{run_output}/{{name}}/{{name}}.mapped.base_distribution_by_cycle.pdf', name=sample_names),
# collect_quality_score_distribution
expand(f'{run_output}/{{name}}/{{name}}.mapped.quality_score_distribution.txt', name=sample_names),
expand(f'{run_output}/{{name}}/{{name}}.mapped.quality_score_distribution.pdf', name=sample_names),
# collect_gc_bias_metrics
expand(f'{run_output}/{{name}}/{{name}}.mapped.gc_bias_metrics.txt', name=sample_names),
rule create_basecalling_params:
output:
barcode_params=f'{run_output}/basecalling/barcode_params.{{lane}}.txt',
library_params=f'{run_output}/basecalling/library_params.{{lane}}.txt'
run: sample_sheet.to_picard_basecalling_params(
directory=f'{run_output}/basecalling/',
bam_prefix=run_output,
lanes=lanes)
rule check_illumina_directory:
input: basecalls_dir
output: f'{run_output}/logs/CheckIlluminaDirectory.log'
priority: 100
params: read_structure=read_structure, lanes=lanes
log: f'{run_output}/logs/CheckIlluminaDirectory.log'
wrapper: f'{wrapper_uri}/picard_check_illumina_directory'
rule collect_illumina_lane_metrics:
input: run_folder
output:
lane_metrics=f'{run_output}/basecalling/{run_barcode}.illumina_lane_metrics.txt',
phasing_metrics=f'{run_output}/basecalling/{run_barcode}.illumina_phasing_metrics.txt'
params:
read_structure=read_structure,
file_extension='.txt'
log: f'{run_output}/logs/CollectIlluminaLaneMetrics.{run_barcode}.log'
wrapper: f'{wrapper_uri}/picard_collect_illumina_lane_metrics'
rule extract_illumina_barcodes:
input:
basecalls_dir=basecalls_dir,
barcode_file=rules.create_basecalling_params.output.barcode_params
output:
metrics_file=f'{run_output}/basecalling/barcodes.{{lane}}/barcode_counts.lane-{{lane}}.metrics.txt',
barcodes_dir=f'{run_output}/basecalling/barcodes.{{lane}}'
threads: max(1, int(threads / len(lanes)))
params:
lane='{lane}',
compress_outputs=True,
read_structure=read_structure
log: f'{run_output}/logs/ExtractIlluminaBarcodes.{{lane}}.log'
wrapper: f'{wrapper_uri}/picard_extract_illumina_barcodes'
rule collect_illumina_basecalling_metrics:
input:
basecalls_dir=basecalls_dir,
barcodes_dir=rules.extract_illumina_barcodes.output.barcodes_dir,
barcode_file=rules.create_basecalling_params.output
output: f'{run_output}/basecalling/basecalling.lane-{{lane}}.metrics.txt',
params:
lane='{lane}',
read_structure=read_structure
log: f'{run_output}/logs/CollectIlluminaBasecallingMetrics.{{lane}}.log'
wrapper: f'{wrapper_uri}/picard_collect_illumina_basecalling_metrics'
rule illumina_basecalls_to_sam:
input:
basecalls_dir=basecalls_dir,
barcodes_dir=rules.extract_illumina_barcodes.output.barcodes_dir,
library_params=rules.create_basecalling_params.output.library_params
output: list_unmerged_sample_bam()
threads: max(1, int(threads / len(lanes)))
resources:
gc_time_limit=50,
gc_heap_free_limit=10,
malloc=lambda wildcards, attempt: int(attempt * 1.5 * 8192),
samjdk_buffer_size=131072,
use_async_io_read_samtools=1,
use_async_io_write_samtools=1
params:
lane='{lane}',
run_barcode=run_barcode,
sequencing_center=sequencing_center,
run_start_date=run_start_date,
read_structure=read_structure,
adapters_to_check=['INDEXED', 'NEXTERA_V2', 'FLUIDIGM'],
include_non_pf_reads=False
log: f'{run_output}/logs/IlluminaBasecallsToSam.{{lane}}.log'
wrapper: f'{wrapper_uri}/picard_illumina_basecalls_to_sam'
rule collect_quality_yield_metrics:
input: list_unmapped_bams_for_sample_name
output: f'{run_output}/{{name}}/{{name}}.unmapped.quality_yield_metrics.txt'
params:
assume_sorted=True,
validation_stringency='SILENT'
log: f'{run_output}/logs/CollectQualityYieldMetrics.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_collect_quality_yield_metrics'
rule merge_sam_files:
input: list_unmapped_bams_for_sample_name
output: temp(f'{run_output}/{{name}}/{{name}}.unmapped.bam')
resources:
samjdk_buffer_size=131072,
use_async_io_read_samtools=1,
use_async_io_write_samtools=1
params:
sort_order='coordinate',
use_threading=True
log: f'{run_output}/logs/MergeSameFiles.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_merge_sam_files'
rule align_with_bwa_mem:
input:
reference=reference,
unmapped_bam=rules.merge_sam_files.output
output: f'{run_output}/{{name}}/{{name}}.mapped.raw.bam'
threads: max(1, int(threads / len(lanes)))
params:
sam_to_fastq={
'clipping_action': 'N',
'clipping_attribute': 'XT',
'clipping_min_length': 25,
'interleave': True,
'include_non_pf_reads': False},
bwa_mem={'p': True, 'v': 2},
merge_bam_alignment={
'aligner_proper_pair_flags': False,
'attributes_to_retain': ['X0', 'ZS', 'ZI', 'ZM', 'ZC', 'ZN'],
'clip_adapters': False,
'expected_orientations': 'FR',
'max_insertions_or_deletions': -1,
'sort_order': 'coordinate'}
log: f'{run_output}/logs/AlignWithBwaMem.{{name}}.log'
wrapper: f'{wrapper_uri}/bwa_mem'
rule mark_duplicates:
input: rules.align_with_bwa_mem.output
output:
bam=f'{run_output}/{{name}}/{{name}}.mapped.bam',
metrics=f'{run_output}/{{name}}/{{name}}.mapped.duplicate_metrics.txt'
params: create_index=True
log: f'{run_output}/logs/MarkDuplicates.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_mark_duplicates'
rule collect_hs_metrics:
input:
reference=reference,
bam=rules.mark_duplicates.output.bam,
bait_intervals=bait_intervals,
output:
summary_output=f'{run_output}/{{name}}/{{name}}.mapped.hybrid_selection_metrics.txt',
per_target_coverage=f'{run_output}/{{name}}/{{name}}.mapped.hybrid_selection_metrics.per_target.txt',
resources:
gc_time_limit=50,
gc_heap_free_limit=10,
malloc=lambda wildcards, attempt: int(attempt * 1.5 * 4096),
samjdk_buffer_size=131072
log: f'{run_output}/logs/CollectHsMetrics.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_collect_hs_metrics'
rule alignment_summary_metrics:
input:
reference=reference,
bam=rules.mark_duplicates.output.bam
output: f'{run_output}/{{name}}/{{name}}.mapped.alignment_summary_metrics.txt'
log: f'{run_output}/logs/CollectAlignmentSummaryMetrics.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_collect_alignment_summary_metrics'
rule collect_insert_size_metrics:
input: rules.mark_duplicates.output.bam
output:
metrics_file=f'{run_output}/{{name}}/{{name}}.mapped.insert_size_metrics.txt',
histogram_file=f'{run_output}/{{name}}/{{name}}.mapped.insert_size_metrics.pdf'
log: f'{run_output}/logs/CollectInsertSizeMetrics.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_collect_insert_size_metrics'
rule collect_sequencing_artifact_metrics:
input:
reference=reference,
bam=rules.mark_duplicates.output.bam
output:
f'{run_output}/{{name}}/{{name}}.mapped.bait_bias_detail_metrics.txt',
f'{run_output}/{{name}}/{{name}}.mapped.bait_bias_summary_metrics.txt',
f'{run_output}/{{name}}/{{name}}.mapped.error_summary_metrics.txt',
f'{run_output}/{{name}}/{{name}}.mapped.pre_adapter_detail_metrics.txt',
f'{run_output}/{{name}}/{{name}}.mapped.pre_adapter_summary_metrics.txt'
params:
output=f'{run_output}/{{name}}/{{name}}.mapped',
file_extension='.txt'
resources:
malloc=lambda wildcards, attempt: int(attempt * 1.5 * 4096)
log: f'{run_output}/logs/CollectSequencingArtifactMetrics.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_collect_sequencing_artifact_metrics'
rule collect_oxo_g_metrics:
input:
reference=reference,
bam=rules.mark_duplicates.output.bam
output: f'{run_output}/{{name}}/{{name}}.mapped.oxo_g_metrics.txt'
log: f'{run_output}/logs/CollectOxoGMetrics.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_collect_oxo_g_metrics'
rule collect_base_distribution_by_cycle:
input: rules.mark_duplicates.output.bam
output:
metrics_file=f'{run_output}/{{name}}/{{name}}.mapped.base_distribution_by_cycle.txt',
chart_output=f'{run_output}/{{name}}/{{name}}.mapped.base_distribution_by_cycle.pdf'
log: f'{run_output}/logs/CollectBaseDistributionByCycle.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_collect_base_distribution_by_cycle'
rule collect_quality_score_distribution:
input: rules.mark_duplicates.output.bam
output:
metrics_file=f'{run_output}/{{name}}/{{name}}.mapped.quality_score_distribution.txt',
chart_output=f'{run_output}/{{name}}/{{name}}.mapped.quality_score_distribution.pdf'
log: f'{run_output}/logs/QualityScoreDistribution.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_quality_score_distribution'
rule collect_gc_bias_metrics:
input: rules.mark_duplicates.output.bam
output:
metrics_file=f'{run_output}/{{name}}/{{name}}.mapped.gc_bias_metrics.txt',
chart_output=f'{run_output}/{{name}}/{{name}}.mapped.gc_bias_metrics.pdf',
summary_output=f'{run_output}/{{name}}/{{name}}.mapped.gc_bias_metrics.summary.txt'
log: f'{run_output}/logs/CollectGcBiasMetrics.{{name}}.log'
wrapper: f'{wrapper_uri}/picard_collect_gc_bias_metrics'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment