samuell · June 14, 2018 15:11
diff --git a/caw_scipipe_yamlexperiment.yml b/caw_scipipe_yamlexperiment.yml
 # --------------------------------------------------------------------------------------------
 # Quick syntax intro
 # --------------------------------------------------------------------------------------------
 # Process definitions:
 # Processes can be defined as "file:" or "shell:" type.
 # - The "shell:" type takes a command pattern with the following special syntax:
 # - {i:portname} defines an in-port (for feeding inputs from upstream processes)
 # - {o:portname} defines an out-port (where outputs will be sent to connected downstream processes)
 # - {p:portname} defines a parameter port, where parameters can be fed as a stream of strings
 # Connections:
 # - Connections are defined under the "input_connections:" section of each process.
 # - One connection is defined per in-port occurring in the shell pattern, in this form:
 #   <inport_name>: <upstream_process>.<outport_name> 
 # Output file naming:
 # - Output file names are defined under the "output_paths:" section of each process.
 # File-type processes:
 # Processes that are defined using the "file:" syntax, are basically just a queue that
 # sends a number of files defined by a pattern, and some repeating values.
 # - In this example, bash-style syntax ({a,b,c}) is used for creating multiple file paths based
 #   on varying values.
 # --------------------------------------------------------------------------------------------
 # Set up some static variables
 # --------------------------------------------------------------------------------------------
 # Static variables, given as a list of single-key mappings in definition order.
 # The list items are indented deeper than the "variables" key so that YAML
 # parses them as its value.
 # Values such as "appsdir + '/...'" are plain strings as far as YAML is
 # concerned; presumably the workflow loader evaluates the concatenation
 # against previously defined variables - TODO confirm.
 - variables:
     # Directory the downloaded apps tarball is unpacked into
     - appsdir: data/apps
     # Base directory for downloaded and raw data
     - datadir: data
     # Reference data directory, located below appsdir
     - refdir: appsdir + '/pipeline_test/ref'
     # Directory for intermediate files
     - tmpdir: tmp
     # Reference genome FASTA and its .fai index, both located below refdir
     - reffasta: refdir + '/human_g1k_v37_decoy.fasta'
     - refindex: refdir + '/human_g1k_v37_decoy.fasta.fai'
 # --------------------------------------------------------------------------------------------
 # Define the processes that make up the workflow
 # --------------------------------------------------------------------------------------------
 # Processes of the workflow, listed in pipeline order.
 # - "shell:" processes declare their ports inside the command pattern
 #   ({i:...} in-ports, {o:...} out-ports, {p:...} parameter ports).
 # - "file:" processes send out the file paths matching their pattern.
 # Shell patterns are quoted (or written as block scalars) so that YAML keeps
 # " # ..." port declarations and leading "{" characters as part of the string.
 - processes:
     # ------------------------------------------------------------------------
     # Get rawdata
     # ------------------------------------------------------------------------
     # Download the apps tarball to the path of the "apps" out-port
     - name: download_rawdata
       shell: 'wget http://uppnex.se/apps.tar.gz -O {o:apps}'
       output_paths:
         - apps: datadir + '/uppnex_apps.tar.gz'
     # Unzip the downloaded tarball
     - name: unzip_rawdata
       shell: 'zcat {i:targz} > {o:tar}'
       input_connections:
         # Connect to the "apps" out-port of the "download_rawdata" process
         - targz: download_rawdata.apps
       output_paths:
         # Output path pattern for the "tar" out-port. It must be quoted: an
         # unquoted value starting with "{" is parsed as a YAML flow mapping.
         # NOTE(review): "%.gz" presumably strips the ".gz" suffix from the
         # input path (bash-style) - confirm against the loader.
         - tar: '{i:targz%.gz}'
     # Untar the archive into the data directory
     - name: untar_rawdata
       # The out-port is not used in the command itself, so it is declared in
       # a trailing shell comment. The pattern is quoted because an unquoted
       # " #" would start a YAML comment and drop the {o:outdir} declaration.
       # NOTE(review): {datadir} follows the bare-brace variable reference
       # used for {reffasta} in align_reads - confirm.
       shell: 'tar -xvf {i:tar} -C {datadir} # {o:outdir}'
       input_connections:
         # Connect the "tar" in-port to the "tar" out-port of "unzip_rawdata"
         - tar: unzip_rawdata.tar
       output_paths:
         - outdir: datadir + '/apps'
     # ------------------------------------------------------------------------
     # Raw data files
     # ------------------------------------------------------------------------
     # The *_1 processes list first-mate (R1) files, the *_2 processes list
     # second-mate (R2) files.
     # NOTE(review): the lane lists differ between the _1 and _2 sets, and the
     # ".fasta.gq" extension looks like a typo for ".fastq.gz" - verify both
     # against the files actually present on disk.
     - name: reads_normal_1
       file: datadir + "/tiny_normal_L00"{1,2,4,7,8}"_R1.fasta.gq"
     - name: reads_normal_2
       file: datadir + "/tiny_normal_L00"{1,2,3,5,6,7}"_R2.fasta.gq"
     - name: reads_tumor_1
       file: datadir + "/tiny_tumor_L00"{1,2,4,7,8}"_R1.fasta.gq"
     - name: reads_tumor_2
       file: datadir + "/tiny_tumor_L00"{1,2,3,5,6,7}"_R2.fasta.gq"
     # ------------------------------------------------------------------------
     # Align reads
     # ------------------------------------------------------------------------
     - name: align_reads
       # Shell command pattern as a folded block scalar: the three lines are
       # joined with spaces into one pipeline, and ">-" drops the trailing
       # newline. Inside a block scalar "#" is literal text, so the trailing
       # "# {i:appsdir}" declares the otherwise unused "appsdir" in-port.
       shell: >-
         bwa mem -R "..." -B 3 -t 4 -M {reffasta} {i:reads1} {i:reads2}
         | samtools view -bS -t {refindex} -
         | samtools sort - > {o:bam} # {i:appsdir}
       # Connect inputs
       input_connections:
         # Each reads in-port receives the concatenation of two file-processes
         # (file-processes have a single output, so no out-port is named).
         - reads1: reads_normal_1,reads_tumor_1
         - reads2: reads_normal_2,reads_tumor_2
         - appsdir: untar_rawdata.outdir
       output_paths:
         # NOTE(review): {p:readskind} and {p:index} are used here but are not
         # declared in the shell pattern above - confirm where they are fed.
         - bam: tmpdir + '/{p:readskind}_{p:index}.bam'
     # ------------------------------------------------------------------------
     # Merge BAMs
     # ------------------------------------------------------------------------
     - name: merge_bams
       # The "readskind" parameter is not used in the command itself. It is
       # therefore declared in a shell comment after the command.
       # NOTE(review): "{i:bams:r: }" presumably joins all incoming paths with
       # a space - confirm.
       shell: 'samtools merge -f {o:merged} {i:bams:r: } # {p:readskind}'
       input_connections:
         # Connect to the "bam" out-port of the "align_reads" process
         - bams: align_reads.bam
       output_paths:
         # Uses the "tmpdir" variable, like the "bam" path of align_reads
         - merged: tmpdir + '/{p:readskind}.bam'
	# --------------------------------------------------------------------------------------------
	# Quick syntax intro
	# --------------------------------------------------------------------------------------------
	# Process definitions:
	# Processes can be defined as "file:" or "shell:" type.
	# - The "shell:" type takes a command pattern with the following special syntax:
	# - {i:portname} defines an in-port (for feeding inputs from upstream processes)
	# - {o:portname} defines an out-port (where outputs will be sent to connected downstream processes)
	# - {p:portname} defines a parameter port, where parameters can be fed as a stream of strings
	# Connections:
	# - Connections are defined under the "input_connections:" section of each process.
	# - One connection is defined per in-port occurring in the shell pattern, in this form:
	# <inport_name>: <upstream_process>.<outport_name>
	# Output file naming:
	# - Output file names are defined under the "output_paths:" section of each process.
	# File-type processes:
	# Processes that are defined using the "file:" syntax, are basically just a queue that
	# sends a number of files defined by a pattern, and some repeating values.
	# - In this example, bash-style syntax ({a,b,c}) is used for creating multiple file paths based
	# on varying values.
	# --------------------------------------------------------------------------------------------
	# Set up some static variables
	# --------------------------------------------------------------------------------------------
	# Static variables, given as a list of single-key mappings in definition order.
	# The list items are indented deeper than the "variables" key so that YAML
	# parses them as its value.
	# Values such as "appsdir + '/...'" are plain strings as far as YAML is
	# concerned; presumably the workflow loader evaluates the concatenation
	# against previously defined variables - TODO confirm.
	- variables:
	    # Directory the downloaded apps tarball is unpacked into
	    - appsdir: data/apps
	    # Base directory for downloaded and raw data
	    - datadir: data
	    # Reference data directory, located below appsdir
	    - refdir: appsdir + '/pipeline_test/ref'
	    # Directory for intermediate files
	    - tmpdir: tmp
	    # Reference genome FASTA and its .fai index, both located below refdir
	    - reffasta: refdir + '/human_g1k_v37_decoy.fasta'
	    - refindex: refdir + '/human_g1k_v37_decoy.fasta.fai'
	# --------------------------------------------------------------------------------------------
	# Define the processes that make up the workflow
	# --------------------------------------------------------------------------------------------
	# Processes of the workflow, listed in pipeline order.
	# - "shell:" processes declare their ports inside the command pattern
	#   ({i:...} in-ports, {o:...} out-ports, {p:...} parameter ports).
	# - "file:" processes send out the file paths matching their pattern.
	# Shell patterns are quoted (or written as block scalars) so that YAML keeps
	# " # ..." port declarations and leading "{" characters as part of the string.
	- processes:
	    # ------------------------------------------------------------------------
	    # Get rawdata
	    # ------------------------------------------------------------------------
	    # Download the apps tarball to the path of the "apps" out-port
	    - name: download_rawdata
	      shell: 'wget http://uppnex.se/apps.tar.gz -O {o:apps}'
	      output_paths:
	        - apps: datadir + '/uppnex_apps.tar.gz'
	    # Unzip the downloaded tarball
	    - name: unzip_rawdata
	      shell: 'zcat {i:targz} > {o:tar}'
	      input_connections:
	        # Connect to the "apps" out-port of the "download_rawdata" process
	        - targz: download_rawdata.apps
	      output_paths:
	        # Output path pattern for the "tar" out-port. It must be quoted: an
	        # unquoted value starting with "{" is parsed as a YAML flow mapping.
	        # NOTE(review): "%.gz" presumably strips the ".gz" suffix from the
	        # input path (bash-style) - confirm against the loader.
	        - tar: '{i:targz%.gz}'
	    # Untar the archive into the data directory
	    - name: untar_rawdata
	      # The out-port is not used in the command itself, so it is declared in
	      # a trailing shell comment. The pattern is quoted because an unquoted
	      # " #" would start a YAML comment and drop the {o:outdir} declaration.
	      # NOTE(review): {datadir} follows the bare-brace variable reference
	      # used for {reffasta} in align_reads - confirm.
	      shell: 'tar -xvf {i:tar} -C {datadir} # {o:outdir}'
	      input_connections:
	        # Connect the "tar" in-port to the "tar" out-port of "unzip_rawdata"
	        - tar: unzip_rawdata.tar
	      output_paths:
	        - outdir: datadir + '/apps'
	    # ------------------------------------------------------------------------
	    # Raw data files
	    # ------------------------------------------------------------------------
	    # The *_1 processes list first-mate (R1) files, the *_2 processes list
	    # second-mate (R2) files.
	    # NOTE(review): the lane lists differ between the _1 and _2 sets, and the
	    # ".fasta.gq" extension looks like a typo for ".fastq.gz" - verify both
	    # against the files actually present on disk.
	    - name: reads_normal_1
	      file: datadir + "/tiny_normal_L00"{1,2,4,7,8}"_R1.fasta.gq"
	    - name: reads_normal_2
	      file: datadir + "/tiny_normal_L00"{1,2,3,5,6,7}"_R2.fasta.gq"
	    - name: reads_tumor_1
	      file: datadir + "/tiny_tumor_L00"{1,2,4,7,8}"_R1.fasta.gq"
	    - name: reads_tumor_2
	      file: datadir + "/tiny_tumor_L00"{1,2,3,5,6,7}"_R2.fasta.gq"
	    # ------------------------------------------------------------------------
	    # Align reads
	    # ------------------------------------------------------------------------
	    - name: align_reads
	      # Shell command pattern as a folded block scalar: the three lines are
	      # joined with spaces into one pipeline, and ">-" drops the trailing
	      # newline. Inside a block scalar "#" is literal text, so the trailing
	      # "# {i:appsdir}" declares the otherwise unused "appsdir" in-port.
	      shell: >-
	        bwa mem -R "..." -B 3 -t 4 -M {reffasta} {i:reads1} {i:reads2}
	        | samtools view -bS -t {refindex} -
	        | samtools sort - > {o:bam} # {i:appsdir}
	      # Connect inputs
	      input_connections:
	        # Each reads in-port receives the concatenation of two file-processes
	        # (file-processes have a single output, so no out-port is named).
	        - reads1: reads_normal_1,reads_tumor_1
	        - reads2: reads_normal_2,reads_tumor_2
	        - appsdir: untar_rawdata.outdir
	      output_paths:
	        # NOTE(review): {p:readskind} and {p:index} are used here but are not
	        # declared in the shell pattern above - confirm where they are fed.
	        - bam: tmpdir + '/{p:readskind}_{p:index}.bam'
	    # ------------------------------------------------------------------------
	    # Merge BAMs
	    # ------------------------------------------------------------------------
	    - name: merge_bams
	      # The "readskind" parameter is not used in the command itself. It is
	      # therefore declared in a shell comment after the command.
	      # NOTE(review): "{i:bams:r: }" presumably joins all incoming paths with
	      # a space - confirm.
	      shell: 'samtools merge -f {o:merged} {i:bams:r: } # {p:readskind}'
	      input_connections:
	        # Connect to the "bam" out-port of the "align_reads" process
	        - bams: align_reads.bam
	      output_paths:
	        # Uses the "tmpdir" variable, like the "bam" path of align_reads
	        - merged: tmpdir + '/{p:readskind}.bam'