Last active
June 26, 2018 18:34
-
-
Save conchoecia/d48da395d7b5d48a9c16c5eb6a9874e3 to your computer and use it in GitHub Desktop.
Snakemake Trim adapters and quality-trim 10X Chromium data from HiSeq.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import os | |
""" | |
snakemake interprets some output files as being ambiguous since the processing pipeline is not a DAG. | |
as a results you must use --allow-ambiguity | |
example usage: | |
snakemake --cores 90 | |
""" | |
# vvvv LOOK HERE vvv | |
# | |
# your path to the directory containing trimmomatic-0.35.jar and adapters/TruSeq3-PE-2.fa | |
# goes here | |
trimmomatic = "/usr/local/bin/Trimmomatic-0.35" | |
maxthreads = 90 | |
# | |
# | |
# ^^^^ LOOK HERE ^^^ | |
filenames = [os.path.basename(f) for f in glob.glob("*_R*_*.fastq.gz")] | |
pairdict = {x.replace("_R1", "").replace("_R2", "").replace(".fastq.gz",""):[] | |
for x in filenames} | |
for sample in filenames: | |
pairdict[sample.replace("_R1", "").replace("_R2", "").replace(".fastq.gz","")].append(sample) | |
for key in pairdict: | |
pairdict[key] = sorted(pairdict[key]) | |
samples = list(pairdict.keys()) | |
#make symlinks to more easily process the links | |
for key in pairdict: | |
for i in range(len(pairdict[key])): | |
if i == 0: | |
output_filepath = "{}_1.fastq.gz".format(key) | |
elif i == 1: | |
output_filepath = "{}_2.fastq.gz".format(key) | |
if os.path.isfile(output_filepath): | |
os.remove(output_filepath) | |
shell("ln -sr {0} {1}".format(pairdict[key][i], output_filepath)) | |
rule all: | |
input: | |
expand("trimmed/{sample}_1.trim.final.fastq.gz", sample=samples), | |
expand("trimmed/{sample}_2.trim.final.fastq.gz", sample=samples), | |
expand("trimmed/{sample}_1.trim.unpaired.fastq.gz", sample=samples), | |
expand("trimmed/{sample}_2.trim.unpaired.fastq.gz", sample=samples) | |
rule trim_forward: | |
input: | |
fastq = "{sample}_1.fastq.gz", | |
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar") | |
output: | |
temp("trimmed/{sample}_1.trim.fastq.gz") | |
threads: | |
maxthreads | |
message: | |
"Trimming the first 24 bases from {input.fastq}" | |
shell: | |
"""java -jar {input.trim_jar} SE -threads {threads} \ | |
{input.fastq} {output} HEADCROP:24""" | |
rule trim_reverse: | |
input: | |
fastq = "{sample}_2.fastq.gz", | |
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar") | |
output: | |
temp("trimmed/{sample}_2.trim.fastq.gz") | |
threads: | |
maxthreads | |
message: | |
"Trimming the first 3 bases from {input.fastq}" | |
shell: | |
"""java -jar {input.trim_jar} SE -threads {threads} \ | |
{input.fastq} {output} HEADCROP:3""" | |
rule trim_pairs: | |
input: | |
f1 = "trimmed/{sample}_1.trim.fastq.gz", | |
f2 = "trimmed/{sample}_2.trim.fastq.gz", | |
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"), | |
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa") | |
output: | |
f_paired = "trimmed/{sample}_1.trim.final.fastq.gz", | |
r_paired = "trimmed/{sample}_2.trim.final.fastq.gz", | |
f_unpaired = "trimmed/{sample}_1.trim.unpaired.fastq.gz", | |
r_unpaired = "trimmed/{sample}_2.trim.unpaired.fastq.gz" | |
threads: | |
maxthreads | |
message: | |
"Trimming {input.f1} and {input.f2} as a pair." | |
shell: | |
"""java -jar {input.trim_jar} PE \ | |
-threads {threads} \ | |
-phred33 {input.f1} {input.f2} \ | |
{output.f_paired} \ | |
{output.f_unpaired} \ | |
{output.r_paired} \ | |
{output.r_unpaired} \ | |
ILLUMINACLIP:{input.adapter_path}:2:30:10:1:TRUE\ | |
LEADING:3 TRAILING:3 \ | |
SLIDINGWINDOW:4:15 MINLEN:36""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment