GiulioCentorame · March 21, 2025 14:24
diff --git a/Crash course in Snakeamake syntax b/Crash course in Snakeamake syntax
 Crash course in Snakemake syntax by Giulio Centorame

 2025-03-22

 1. Save this Snakefile in any folder where you can generate a few text files
 2. Read the actual file
 3. Dry run with
    snakemake -n
  or also print the shell command of each job with
    snakemake -np
 4. Run the workflow using
    snakemake -call
 5. Try to edit some of the commands and repeat from step 3.

 Hope this is useful!
diff --git a/Snakefile b/Snakefile
 # Okay so here's an example Snakefile

 # Place it in an example folder and it will generate a bunch of files
 # for you

 # Importing packages works like normal python
 from snakemake.utils import min_version
 min_version("8.0")

 # Root folder for every file the workflow writes. A relative folder keeps the
 # example self-contained, but an absolute path works too, e.g.
 #SCRATCH = "/PURR/MEOW/my_scratch/"
 SCRATCH = "outputs"

 # All the penguin species that I know of
 species = [
     "adelie",
     "chinstrap",
     "gentoo",
 ]

 # The most important rule of all is

 rule all:
    # Target rule: it only lists the final file we want, and Snakemake works
    # out which rules have to run to produce it
    input:
        f"{SCRATCH}/final_file.tsv"
      
 # The first rule of a workflow is the target rule (called "all" by
 # convention). ONLY the rules upstream of "all" will be run. Note that 
 # all the paths in any rule are strings, so you can (should) use f-strings
 # to make the paths more readable

 # We're gonna define all the rules below, but you can add rules to
 # a Snakefile by using `include` 
 #include: "my_rules.smk"

 # Let's define our first proper rule
 rule download_penguin_table:
    # This is a special rule: it just creates a file, so you don't need inputs!
    output:
        f"{SCRATCH}/intermediate_files/penguins_size.csv"
    # I have a long URL for the file to download, so I'd rather shove it in
    # `params`. Params are just values that get substituted into the command
    # when you reference them, e.g. `{params.URL}` below
    params:
        URL = "https://raw.githubusercontent.com/dickoa/penguins/refs/heads/master/data/penguins_size.csv"
    # There are two types of shell directive:
    # "single line" and
    # """
    # multi
    # line
    # """
    # Multi-line shell directives allow the use of newlines and quotation
    # marks; otherwise they're identical.
    shell:
        "wget -nd -O {output} {params.URL}"
    # Anything in curly brackets will be replaced by Snakemake with either
    # a part of a rule (if using the words `input`, `output` etc)
    # or a wildcard. Wildcards are best described with a more elaborate
    # example, so I'll just refer you to the Snakemake documentation for now:
    # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#wildcards
    # `output` is made of 1 element, so we can just reference it using
    # `{output}`. `{output[0]}` is also acceptable (indexing in Python starts
    # at 0, not 1)

 # CSVs stink so let's transform it in a TSV
 rule format_penguin_table:
    # This time we have an input!
    # NOTE that the order of the rules doesn't matter, Snakemake will figure it
    # out on its own as long as inputs and outputs are unambiguous
    input:
        f"{SCRATCH}/intermediate_files/penguins_size.csv"
    output:
        f"{SCRATCH}/intermediate_files/penguins_size_formatted.tsv"
    # Multiline statement this time, with quotation marks.
    # The command is a regular Python string, so "\\t" reaches the shell as
    # "\t", which `tr` understands as a tab: every comma becomes a tab
    shell:
        """
        tr "," "\\t" < {input} > {output}
        """

 rule filter_penguin_table:
    input:
        f"{SCRATCH}/intermediate_files/penguins_size_formatted.tsv"
    output:
        f"{SCRATCH}/intermediate_files/penguins_size_filtered.tsv"
    # Note that double curly brackets escape them (they'll be actual curly
    # brackets in the script, like in the case of awk)
    #
    # Let's keep columns 1, 5 and 7 and see which penguins have
    # flippers > 200 mm
    # - `NR==1` keeps the first (header) row explicitly
    # - `$2+0` forces a numeric comparison, so non-numeric values (e.g. "NA")
    #   are dropped instead of passing a string comparison against "200"
    shell:
        """
        cut -f1,5,7 {input} | awk 'NR==1 || $2+0>200{{print $0}}' > {output}
        """

 # I have one last thing to show: let's split the table in 3 and use
 # named arguments
 rule split_penguin_table:
    # Splits the filtered table into its first row plus one file per species,
    # using named inputs and outputs
    input:
        # Even if it's just one file, if we use named arguments, they all need
        # to be named
        file = f"{SCRATCH}/intermediate_files/penguins_size_filtered.tsv"
    output:
        first_row = f"{SCRATCH}/intermediate_files/misc/first_row.txt",
        #                                                            ^^^
        #                                                            |||
        #                                 REMEMBER THE COMMAS AT THE END
        gentoo = f"{SCRATCH}/intermediate_files/by_penguin/gentoo.tsv",
        adelie = f"{SCRATCH}/intermediate_files/by_penguin/adelie.tsv",
        chinstrap = f"{SCRATCH}/intermediate_files/by_penguin/chinstrap.tsv"
        # I can't stress enough how important the commas at the end of each
        # element are: they're one of the main sources of headaches in
        # Snakemake, together with tabs/whitespace inconsistencies

        # This is a simple way of splitting files, but there's a ton of ways
        # of doing that in Snakemake if you dig in the documentation
    # Named files are referenced as {input.<name>} / {output.<name>}.
    # `head -n1` copies the first row; each awk call keeps the rows whose
    # first column is exactly that species name
    shell:
        """
        head -n1 < {input.file} > {output.first_row}
        awk '$1=="Chinstrap"{{print $0}}' {input.file} > {output.chinstrap}
        awk '$1=="Gentoo"{{print $0}}' {input.file} > {output.gentoo}
        awk '$1=="Adelie"{{print $0}}' {input.file} > {output.adelie}
        """

 # I want females only, but I have three files. This is a great case for wildcards!
 rule filter_penguins_by_sex:
    # One rule for every per-species file: keeps the rows whose third column
    # is "FEMALE"
    input:
        # Wildcards are identified by {...}. Sounds familiar?
        # It's the same syntax we use for f-strings ( f"{...}" ), so inside an
        # f-string we escape them in the same way, using {{ }}.
        # We can then access them as normal variables under `wildcards`,
        # e.g. {wildcards.species}
        f"{SCRATCH}/intermediate_files/by_penguin/{{species}}.tsv"
    output:
        f"{SCRATCH}/intermediate_files/by_penguin/{{species}}_filtered.tsv"
    shell:
        """
        awk '$3=="FEMALE"{{print $0}}' {input} > {output}
        """
 # Note how this is just one rule for all three files! We actually defined all
 # the possible values above (the `species` list), but this is not strictly
 # necessary: Snakemake can figure out the wildcard values on its own from the
 # files that are requested. We only defined the list because it's convenient
 # for the next rule.

 # Finally, we need to explain expand()
 # expand() is a function that fills in the wildcards of a path with every
 # value of the variables you pass to it, returning a list of paths. It's
 # particularly useful when you want to aggregate multiple files with similar
 # names (e.g. when merging them together or in your "all" rule)

 rule create_final_dataset:
    # Concatenates the first row and the filtered per-species tables into the
    # final file
    input:
        # Our first row
        f"{SCRATCH}/intermediate_files/misc/first_row.txt",
        # Everything else: expand() returns one path per value in `species`
        expand(
            f"{SCRATCH}/intermediate_files/by_penguin/{{species}}_filtered.tsv",
            species = species
        )
    output:
        f"{SCRATCH}/final_file.tsv"
    shell:
        # Before you ask anything:
        # - All the arguments in input are passed in order
        # - All the arguments in input are passed one after the other
        #   e.g. path1 path2 path3
        "cat {input} > {output}"
        # This is the same as writing
        #"cat {input[0]} {input[1]} {input[2]} {input[3]} > {output[0]}"

 # ... and this is it! Go check the output files in the same folder and
 # the .snakemake/ hidden folder!
	Crash course in Snakemake syntax by Giulio Centorame

	2025-03-22

	1. Save this Snakefile in any folder where you can generate a few text files
	2. Read the actual file
	3. Dry run with
	snakemake -n
	or also print the shell command of each job with
	snakemake -np
	4. Run the workflow using
	snakemake -call
	5. Try to edit some of the commands and repeat from step 3.

	Hope this is useful!
	# Okay so here's an example Snakefile

	# Place it in an example folder and it will generate a bunch of files
	# for you

	# Importing packages works like normal python
	from snakemake.utils import min_version
	min_version("8.0")

	# Root folder for every file the workflow writes. A relative folder keeps the
	# example self-contained, but an absolute path works too, e.g.
	#SCRATCH = "/PURR/MEOW/my_scratch/"
	SCRATCH = "outputs"

	# All the penguin species that I know of
	species = [
	    "adelie",
	    "chinstrap",
	    "gentoo",
	]

	# The most important rule of all is

	rule all:
	    # Target rule: it only lists the final file we want, and Snakemake works
	    # out which rules have to run to produce it
	    input:
	        f"{SCRATCH}/final_file.tsv"

	# The first rule of a workflow is the target rule (called "all" by
	# convention). ONLY the rules upstream of "all" will be run. Note that
	# all the paths in any rule are strings, so you can (should) use f-strings
	# to make the paths more readable

	# We're gonna define all the rules below, but you can add rules to
	# a Snakefile by using `include`
	#include: "my_rules.smk"

	# Let's define our first proper rule
	rule download_penguin_table:
	    # This is a special rule: it just creates a file, so you don't need inputs!
	    output:
	        f"{SCRATCH}/intermediate_files/penguins_size.csv"
	    # I have a long URL for the file to download, so I'd rather shove it in
	    # `params`. Params are just values that get substituted into the command
	    # when you reference them, e.g. `{params.URL}` below
	    params:
	        URL = "https://raw.githubusercontent.com/dickoa/penguins/refs/heads/master/data/penguins_size.csv"
	    # There are two types of shell directive:
	    # "single line" and
	    # """
	    # multi
	    # line
	    # """
	    # Multi-line shell directives allow the use of newlines and quotation
	    # marks; otherwise they're identical.
	    shell:
	        "wget -nd -O {output} {params.URL}"
	    # Anything in curly brackets will be replaced by Snakemake with either
	    # a part of a rule (if using the words `input`, `output` etc)
	    # or a wildcard. Wildcards are best described with a more elaborate
	    # example, so I'll just refer you to the Snakemake documentation for now:
	    # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#wildcards
	    # `output` is made of 1 element, so we can just reference it using
	    # `{output}`. `{output[0]}` is also acceptable (indexing in Python starts
	    # at 0, not 1)

	# CSVs stink so let's transform it in a TSV
	rule format_penguin_table:
	    # This time we have an input!
	    # NOTE that the order of the rules doesn't matter, Snakemake will figure it
	    # out on its own as long as inputs and outputs are unambiguous
	    input:
	        f"{SCRATCH}/intermediate_files/penguins_size.csv"
	    output:
	        f"{SCRATCH}/intermediate_files/penguins_size_formatted.tsv"
	    # Multiline statement this time, with quotation marks.
	    # The command is a regular Python string, so "\\t" reaches the shell as
	    # "\t", which `tr` understands as a tab: every comma becomes a tab
	    shell:
	        """
	        tr "," "\\t" < {input} > {output}
	        """

	rule filter_penguin_table:
	    input:
	        f"{SCRATCH}/intermediate_files/penguins_size_formatted.tsv"
	    output:
	        f"{SCRATCH}/intermediate_files/penguins_size_filtered.tsv"
	    # Note that double curly brackets escape them (they'll be actual curly
	    # brackets in the script, like in the case of awk)
	    #
	    # Let's keep columns 1, 5 and 7 and see which penguins have
	    # flippers > 200 mm
	    # - `NR==1` keeps the first (header) row explicitly
	    # - `$2+0` forces a numeric comparison, so non-numeric values (e.g. "NA")
	    #   are dropped instead of passing a string comparison against "200"
	    shell:
	        """
	        cut -f1,5,7 {input} | awk 'NR==1 || $2+0>200{{print $0}}' > {output}
	        """

	# I have one last thing to show: let's split the table in 3 and use
	# named arguments
	rule split_penguin_table:
	    # Splits the filtered table into its first row plus one file per species,
	    # using named inputs and outputs
	    input:
	        # Even if it's just one file, if we use named arguments, they all need
	        # to be named
	        file = f"{SCRATCH}/intermediate_files/penguins_size_filtered.tsv"
	    output:
	        first_row = f"{SCRATCH}/intermediate_files/misc/first_row.txt",
	        #                                                            ^^^
	        #                                                            |||
	        #                                 REMEMBER THE COMMAS AT THE END
	        gentoo = f"{SCRATCH}/intermediate_files/by_penguin/gentoo.tsv",
	        adelie = f"{SCRATCH}/intermediate_files/by_penguin/adelie.tsv",
	        chinstrap = f"{SCRATCH}/intermediate_files/by_penguin/chinstrap.tsv"
	        # I can't stress enough how important the commas at the end of each
	        # element are: they're one of the main sources of headaches in
	        # Snakemake, together with tabs/whitespace inconsistencies

	        # This is a simple way of splitting files, but there's a ton of ways
	        # of doing that in Snakemake if you dig in the documentation
	    # Named files are referenced as {input.<name>} / {output.<name>}.
	    # `head -n1` copies the first row; each awk call keeps the rows whose
	    # first column is exactly that species name
	    shell:
	        """
	        head -n1 < {input.file} > {output.first_row}
	        awk '$1=="Chinstrap"{{print $0}}' {input.file} > {output.chinstrap}
	        awk '$1=="Gentoo"{{print $0}}' {input.file} > {output.gentoo}
	        awk '$1=="Adelie"{{print $0}}' {input.file} > {output.adelie}
	        """

	# I want females only, but I have three files. This is a great case for wildcards!
	rule filter_penguins_by_sex:
	    # One rule for every per-species file: keeps the rows whose third column
	    # is "FEMALE"
	    input:
	        # Wildcards are identified by {...}. Sounds familiar?
	        # It's the same syntax we use for f-strings ( f"{...}" ), so inside an
	        # f-string we escape them in the same way, using {{ }}.
	        # We can then access them as normal variables under `wildcards`,
	        # e.g. {wildcards.species}
	        f"{SCRATCH}/intermediate_files/by_penguin/{{species}}.tsv"
	    output:
	        f"{SCRATCH}/intermediate_files/by_penguin/{{species}}_filtered.tsv"
	    shell:
	        """
	        awk '$3=="FEMALE"{{print $0}}' {input} > {output}
	        """
	# Note how this is just one rule for all three files! We actually defined all
	# the possible values above (the `species` list), but this is not strictly
	# necessary: Snakemake can figure out the wildcard values on its own from the
	# files that are requested. We only defined the list because it's convenient
	# for the next rule.

	# Finally, we need to explain expand()
	# expand() is a function that fills in the wildcards of a path with every
	# value of the variables you pass to it, returning a list of paths. It's
	# particularly useful when you want to aggregate multiple files with similar
	# names (e.g. when merging them together or in your "all" rule)

	rule create_final_dataset:
	    # Concatenates the first row and the filtered per-species tables into the
	    # final file
	    input:
	        # Our first row
	        f"{SCRATCH}/intermediate_files/misc/first_row.txt",
	        # Everything else: expand() returns one path per value in `species`
	        expand(
	            f"{SCRATCH}/intermediate_files/by_penguin/{{species}}_filtered.tsv",
	            species = species
	        )
	    output:
	        f"{SCRATCH}/final_file.tsv"
	    shell:
	        # Before you ask anything:
	        # - All the arguments in input are passed in order
	        # - All the arguments in input are passed one after the other
	        #   e.g. path1 path2 path3
	        "cat {input} > {output}"
	        # This is the same as writing
	        #"cat {input[0]} {input[1]} {input[2]} {input[3]} > {output[0]}"

	# ... and this is it! Go check the output files in the same folder and
	# the .snakemake/ hidden folder!