Created
March 21, 2025 14:24
-
-
Save GiulioCentorame/98dcaafbc0fd359659d9445fc23d4583 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Crash course in Snakemake syntax by Giulio Centorame | |
2025-03-22 | |
1. Save this Snakefile in any folder where you can generate a few text files | |
2. Read the actual file | |
3. Dry run with | |
snakemake -n | |
or see all the individual jobs with | |
snakemake -np | |
4. Run the workflow using | |
snakemake -call | |
5. Try to edit some of the commands and repeat from step 3. | |
Hope this is useful! |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Okay so here's an example Snakefile | |
# Place it in an example folder and it will generate a bunch of files | |
# for you | |
# Importing packages works like normal python | |
from snakemake.utils import min_version | |
# Require Snakemake 8.0 or newer before doing anything else
min_version("8.0")

# Root folder for every file this workflow writes.
# It could just as well be an absolute path, e.g.
#SCRATCH = "/PURR/MEOW/my_scratch/"
# but a relative folder is a more reasonable choice for a demo
SCRATCH = "outputs"

# All the penguin species that I know of (reused by expand() further down)
species = [
    "adelie",
    "chinstrap",
    "gentoo",
]
# The most important rule of all is | |
rule all:
    # Target rule: it produces nothing itself, it only requests the final
    # table so that Snakemake schedules whatever is needed to build it
    input:
        f"{SCRATCH}/final_file.tsv"
# The first rule of a workflow is the target rule (called "all" by | |
# convention). ONLY the rules upstream of "all" will be run. Note that | |
# all the paths in any rule are strings, so you can (should) use f-strings | |
# to make the paths more readable | |
# We're gonna define all the rules below, but you can add rules to | |
# a Snakefile by using `include` | |
#include: "my_rules.smk" | |
# Let's define our first proper rule | |
rule download_penguin_table:
    # A rule without `input`: all it does is create its output file
    output:
        f"{SCRATCH}/intermediate_files/penguins_size.csv"
    # `params` holds values that are not files. Here it keeps the long URL
    # out of the command; the shell line refers to it as {params.URL}
    params:
        URL = "https://raw.githubusercontent.com/dickoa/penguins/refs/heads/master/data/penguins_size.csv"
    # The shell directive accepts either a
    # "single line" string or a
    # """
    # multi
    # line
    # """
    # one. The multi-line form allows newlines and quotation marks inside
    # the command; otherwise the two are identical.
    shell:
        "wget -nd -O {output} {params.URL}"
# Anything in curly brackets will be replaced by Snakemake with either | |
# a part of a rule (if using the words `input`, `output` etc) | |
# or a wildcard. Wildcards are best described with a more elaborate | |
# example, so I'll just refer you to the Snakemake documentation for now: | |
# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#wildcards | |
# `output` is made of 1 element, so we can just reference it using
# `{output}`. `{output[0]}` is also acceptable (Python indexing starts
# at 0, not 1)
# CSVs stink so let's transform it into a TSV
rule format_penguin_table:
    # This rule does have an input: the CSV written by the download rule.
    # NOTE: rules may be declared in any order. Snakemake connects them by
    # matching inputs to outputs, as long as those are unambiguous
    input:
        f"{SCRATCH}/intermediate_files/penguins_size.csv"
    output:
        f"{SCRATCH}/intermediate_files/penguins_size_formatted.tsv"
    # Multi-line command this time, so double quotes can be used freely.
    # "\\t" reaches the shell as \t, which tr turns into a tab character
    shell:
        """
        tr "," "\\t" < {input}> {output}
        """
rule filter_penguin_table:
    # Keep columns 1, 5 and 7 (used downstream as species, flipper length
    # and sex) and only the penguins with flippers > 200 mm
    input:
        f"{SCRATCH}/intermediate_files/penguins_size_formatted.tsv"
    output:
        f"{SCRATCH}/intermediate_files/penguins_size_filtered.tsv"
    # Note that double curly brackets escape them (they'll be actual curly
    # brackets in the script, like in the case of awk).
    # NR==1 keeps the header row on purpose (split_penguin_table reads it
    # back with head -n1). $2+0 forces a numeric comparison: a plain $2>200
    # compares non-numeric fields such as "NA" as strings, and those would
    # slip through the filter
    shell:
        """
        cut -f1,5,7 {input} | awk 'NR==1 || $2+0>200{{print $0}}' > {output}
        """
# I have one last thing to show: let's split the table in 3 and use | |
# named arguments | |
rule split_penguin_table:
    input:
        # A named entry, referenced in the command as {input.file}.
        # Once you start naming entries it is simplest to name all of
        # them, even when there is just one file
        file = f"{SCRATCH}/intermediate_files/penguins_size_filtered.tsv"
    output:
        first_row = f"{SCRATCH}/intermediate_files/misc/first_row.txt",
        # REMEMBER THE COMMA AT THE END OF EVERY ENTRY BUT THE LAST ^^^
        gentoo = f"{SCRATCH}/intermediate_files/by_penguin/gentoo.tsv",
        adelie = f"{SCRATCH}/intermediate_files/by_penguin/adelie.tsv",
        chinstrap = f"{SCRATCH}/intermediate_files/by_penguin/chinstrap.tsv"
    # I can't stress enough how important the commas between entries are:
    # together with tabs/whitespace inconsistencies they're one of the
    # main sources of headaches in Snakemake.
    # The command below is a simple way of splitting a file; the
    # documentation describes plenty of other ways to do it
    shell:
        """
        head -n1 < {input.file} > {output.first_row}
        awk '$1=="Chinstrap"{{print $0}}' {input.file} > {output.chinstrap}
        awk '$1=="Gentoo"{{print $0}}' {input.file} > {output.gentoo}
        awk '$1=="Adelie"{{print $0}}' {input.file} > {output.adelie}
        """
# I want females only, but I have three files. This is a great case for wildcards! | |
rule filter_penguins_by_sex:
    input:
        # A wildcard is written {name}. Sounds familiar? f-strings use the
        # same braces ( f"{...}" ), so inside an f-string the wildcard is
        # escaped as {{name}} to survive the formatting.
        # Its value can then be accessed like a normal variable, as
        # wildcards.species
        f"{SCRATCH}/intermediate_files/by_penguin/{{species}}.tsv"
    output:
        f"{SCRATCH}/intermediate_files/by_penguin/{{species}}_filtered.tsv"
    # Keep only the rows whose third column is FEMALE
    shell:
        """
        awk '$3=="FEMALE"{{print $0}}' {input} > {output}
        """
# Note how this is just one rule for all three files! We actually defined all
# the possible values above, but this is not strictly necessary. We only do
# that because it's convenient with the next rule, but Snakemake can figure
# this out on its own.
# Finally, we need to explain expand() | |
# expand() is a function that lets you expand paths using variables, and it's
# particularly useful when you want to aggregate multiple files with similar
# names (e.g. when merging them together or in your "all" rule)
rule create_final_dataset:
    input:
        # The header row goes first...
        f"{SCRATCH}/intermediate_files/misc/first_row.txt",
        # ...followed by one filtered table per entry in `species`
        expand(
            f"{SCRATCH}/intermediate_files/by_penguin/{{species}}_filtered.tsv",
            species = species
        )
    output:
        f"{SCRATCH}/final_file.tsv"
    shell:
        # Before you ask anything: {input} is replaced by every input path,
        # in the order declared above and separated by spaces,
        # e.g. path1 path2 path3
        "cat {input} > {output}"
        # This is the same as writing
        #"cat {input[0]} {input[1]} {input[2]} {input[3]} > {output[0]}"
# ... and this is it! Go check the output files in the same folder and | |
# the .snakemake/ hidden folder! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment