Last active
April 13, 2018 13:24
-
-
Save conchoecia/a35fcb654ae8decd132224b6a5c53eef to your computer and use it in GitHub Desktop.
This gist takes a list of directories that contain fastq files and trims them, runs fastqc, and outputs a QC report with info on trimming efficiency and Hi-C linker sequence content.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from pathlib import Path | |
import yaml | |
configfile: "directories.yaml" | |
# The yaml file should look like this below. Just an object called "directories" | |
# and a list of directories in which the pipeline should look for things. | |
# | |
# directories: | |
# - "/data/raw/genomic_reads/180315_M00160_0073_000000000-BNBV5/" | |
# - "/data/raw/genomic_reads/180123_M00160_0067_000000000-BKDV3/" | |
# - "/data/raw/genomic_reads/171018_M00160_0050_000000000-BFNHV/" | |
# vvvv LOOK HERE vvv | |
# | |
# your path to the directory containing trimmomatic-0.35.jar and adapters/TruSeq3-PE-2.fa | |
# goes here | |
trimmomatic = "/usr/local/bin/Trimmomatic-0.35" | |
ntpath = "/data/ncbi/db/nt" | |
# | |
# | |
# ^^^^ LOOK HERE ^^^ | |
def is_fastq(filepath): | |
suffix_set = set(Path(filepath).suffixes) | |
fastqs = set([".fastq", ".fq"]) | |
fastq_intersection = suffix_set & fastqs | |
if len(fastq_intersection) > 0: | |
return True | |
else: | |
return False | |
#now we must populate the sample list | |
config["samples"] = {} | |
for this_d in config["directories"]: | |
these_files = os.listdir(this_d) | |
fastqs = [filepath for filepath in these_files if is_fastq(filepath)] | |
samplelist = set() | |
for filepath in fastqs: | |
samplelist.add(filepath.split("_")[0]) | |
for sample in samplelist: | |
sample_fastqs = sorted([filepath for filepath in fastqs if sample in filepath]) | |
f = os.path.join(this_d, sample_fastqs[0]) | |
r = os.path.join(this_d, sample_fastqs[1]) | |
if sample in config["samples"]: | |
raise IOError("""Library {} was found in more than two directories. | |
Please only include one set of readpairs per library: | |
- {} | |
- {}""".format(sample, this_d, config["samples"][sample]["f"])) | |
else: | |
config["samples"][sample] = {} | |
config["samples"][sample]["f"] = f | |
config["samples"][sample]["r"] = r | |
rule all: | |
input: | |
expand("trimmed/{sample}_trimlog.txt", sample=config["samples"]), | |
expand("report/blast/{sample}_f.blast.txt", sample= config["samples"]), | |
expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]), | |
expand("fastqc/raw/{sample}_f_fastqc.html", sample = config["samples"]), | |
expand("fastqc/raw/{sample}_f_fastqc.zip", sample = config["samples"]), | |
expand("fastqc/raw/{sample}_r_fastqc.html", sample = config["samples"]), | |
expand("fastqc/raw/{sample}_r_fastqc.zip", sample = config["samples"]), | |
expand("fastqc/trimmed/{sample}_f.trim_fastqc.html", sample = config["samples"]), | |
expand("fastqc/trimmed/{sample}_f.trim_fastqc.zip", sample = config["samples"]), | |
expand("fastqc/trimmed/{sample}_r.trim_fastqc.html", sample = config["samples"]), | |
expand("fastqc/trimmed/{sample}_r.trim_fastqc.zip", sample = config["samples"]), | |
expand("report/numreads/{sample}.numreads.txt",sample = config["samples"]), | |
expand("report/numreads/{sample}.trimmed.numreads.txt", sample = config["samples"]), | |
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), | |
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), | |
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), | |
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), | |
expand( "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample = config["samples"]), | |
expand( "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample = config["samples"]), | |
expand( "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample = config["samples"]), | |
expand( "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample = config["samples"]), | |
"report/final_report.txt" | |
rule make_fake_reads: | |
input: | |
[config["samples"][sample][readdir] \ | |
for readdir in ['f', 'r'] \ | |
for sample in config["samples"]] | |
output: | |
forward = sorted(expand("reads/{sample}_f.fastq.gz", sample=config["samples"])), | |
reverse = sorted(expand("reads/{sample}_r.fastq.gz", sample=config["samples"])) | |
message: | |
"making soft links" | |
run: | |
for this_sample in sorted(config["samples"]): | |
for this_direction in ["f", "r"]: | |
os.symlink(config["samples"][this_sample][this_direction], | |
"reads/{}_{}.fastq.gz".format(this_sample, this_direction)) | |
rule trim_pairs: | |
input: | |
f1 = "reads/{sample}_f.fastq.gz", | |
f2 = "reads/{sample}_r.fastq.gz", | |
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"), | |
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa") | |
output: | |
f_paired = "trimmed/{sample}_f.trim.fastq.gz", | |
r_paired = "trimmed/{sample}_r.trim.fastq.gz", | |
f_unpaired = temp("trimmed/{sample}_1.trim.unpaired.fastq.gz"), | |
r_unpaired = temp("trimmed/{sample}_2.trim.unpaired.fastq.gz"), | |
trimlog = "trimmed/{sample}_trimlog.txt" | |
threads: | |
60 | |
message: | |
"adapter and quality trimming files" | |
shell: | |
"""java -jar {input.trim_jar} PE \ | |
-phred33 -threads {threads} \ | |
-trimlog {output.trimlog}\ | |
{input.f1} {input.f2} \ | |
{output.f_paired} \ | |
{output.f_unpaired} \ | |
{output.r_paired} \ | |
{output.r_unpaired} \ | |
ILLUMINACLIP:{input.adapter_path}:2:30:10 \ | |
LEADING:3 TRAILING:3 \ | |
SLIDINGWINDOW:4:15 MINLEN:36""" | |
rule num_reads: | |
input: | |
forward_trim = "trimmed/{sample}_f.trim.fastq.gz", | |
f1 = "reads/{sample}_f.fastq.gz" | |
output: | |
numreads = "report/numreads/{sample}.numreads.txt", | |
trimmed = "report/numreads/{sample}.trimmed.numreads.txt" | |
shell: | |
"""echo $(bioawk -cfastx 'END{{print NR}}' {input.forward_trim}) > {output.trimmed}; \ | |
echo $(bioawk -cfastx 'END{{print NR}}' {input.f1}) > {output.numreads}""" | |
rule library_efficiency: | |
input: | |
numreads = "report/numreads/{sample}.numreads.txt", | |
trimmed = "report/numreads/{sample}.trimmed.numreads.txt" | |
output: | |
"report/efficiency/{sample}.efficiency.txt" | |
message: | |
"calculating the library efficiency" | |
shell: | |
"""cat {input.trimmed} {input.numreads} | \ | |
paste - - | \ | |
awk '{{print $1/$2}}' > {output} """ | |
rule fastq_to_fasta_fwd: | |
input: | |
forward = "trimmed/{sample}_f.trim.fastq.gz" | |
output: | |
temp("trimmed/{sample}_f.fasta") | |
message: | |
"Converting fastq forward file to fasta" | |
shell: | |
"seqtk seq -A {input.forward} > {output}" | |
rule sample_fasta: | |
input: | |
forward = "trimmed/{sample}_f.fasta" | |
output: | |
temp("trimmed/{sample}_f.subsample.fasta") | |
message: | |
"Sampling up to 100k reads from the fasta" | |
shell: | |
"""NUMREADS=$(bioawk -cfastx 'END{{print NR}}' {input.forward});\ | |
if [ $NUMREADS > 100000 ]; then\ | |
echo '{input.forward} has '"$NUMREADS"' reads. Sampling 100k.'; \ | |
seqtk sample -s100 {input.forward} 100000 > {output};\ | |
else\ | |
echo '{input.forward} has '"$NUMREADS"' reads. Not sampling.';\ | |
cp {input.forward} {output};\ | |
fi | |
""" | |
rule blast_sample: | |
input: | |
forward = "trimmed/{sample}_f.subsample.fasta" | |
output: | |
"report/blast/{sample}_f.blast.txt" | |
message: | |
"BLAST the reads" | |
params: | |
pntpath = ntpath | |
threads: | |
96 | |
shell: | |
"""blastn -query {input.forward} \ | |
-db {params.pntpath} -outfmt 6 \ | |
-out {output} -num_threads {threads}""" | |
rule fastqc_raw: | |
input: | |
f1 = "reads/{sample}_f.fastq.gz", | |
f2 = "reads/{sample}_r.fastq.gz", | |
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa") | |
output: | |
"fastqc/raw/{sample}_f_fastqc.html", | |
"fastqc/raw/{sample}_f_fastqc.zip", | |
"fastqc/raw/{sample}_r_fastqc.html", | |
"fastqc/raw/{sample}_r_fastqc.zip" | |
message: | |
"making a fastqc report of the raw reads" | |
params: | |
raw_dir = "fastqc/raw/" | |
shell: | |
"""fastqc -a {input.adapter_path} \ | |
-o {params.raw_dir} \ | |
{input.f1} {input.f2}""" | |
rule fastqc_trimmed: | |
input: | |
f1 = "trimmed/{sample}_f.trim.fastq.gz", | |
f2 = "trimmed/{sample}_r.trim.fastq.gz", | |
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa") | |
output: | |
"fastqc/trimmed/{sample}_f.trim_fastqc.html", | |
"fastqc/trimmed/{sample}_f.trim_fastqc.zip", | |
"fastqc/trimmed/{sample}_r.trim_fastqc.html", | |
"fastqc/trimmed/{sample}_r.trim_fastqc.zip" | |
message: | |
"making a fastqc report of the trimmed reads" | |
params: | |
raw_dir = "fastqc/trimmed/" | |
shell: | |
"""fastqc -a {input.adapter_path} \ | |
-o {params.raw_dir} \ | |
{input.f1} {input.f2}""" | |
rule GATCGATC_f: | |
input: | |
f1 = "reads/{sample}_f.fastq.gz", | |
f2 = "reads/{sample}_r.fastq.gz", | |
f1t = "trimmed/{sample}_f.trim.fastq.gz", | |
f2t = "trimmed/{sample}_r.trim.fastq.gz" | |
output: | |
f1o = "report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", | |
f2o = "report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count", | |
f1to = "report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count", | |
f2to = "report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count" | |
shell: | |
"set +o pipefail; " | |
"bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'GATCGATC' - | wc -l > {output.f1o}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'GATCGATC' - | wc -l > {output.f2o}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'GATCGATC' - | wc -l > {output.f1to}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'GATCGATC' - | wc -l > {output.f2to}" | |
rule AATTAATT_num: | |
input: | |
f1 = "reads/{sample}_f.fastq.gz", | |
f2 = "reads/{sample}_r.fastq.gz", | |
f1t = "trimmed/{sample}_f.trim.fastq.gz", | |
f2t = "trimmed/{sample}_r.trim.fastq.gz" | |
output: | |
f1o = "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", | |
f2o = "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", | |
f1to = "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", | |
f2to = "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count" | |
shell: | |
"set +o pipefail; " | |
"bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'AATTAATT' - | wc -l > {output.f1o}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'AATTAATT' - | wc -l > {output.f2o}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'AATTAATT' - | wc -l > {output.f1to}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'AATTAATT' - | wc -l > {output.f2to}" | |
def read_number_from_file(filename): | |
with open(filename, "r") as f: | |
for line in f: | |
if line.strip(): | |
return line.strip() | |
rule collate_report: | |
""" this method prints out the final qc report of all the libraries.""" | |
input: | |
raw_num = expand("report/numreads/{sample}.numreads.txt", sample=config["samples"]), | |
trim_num = expand("report/numreads/{sample}.trimmed.numreads.txt", sample=config["samples"]), | |
efficiency = expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]), | |
f1dpn = expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample=config["samples"]), | |
f2dpn = expand("report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count", sample=config["samples"]), | |
f1mluc = expand("report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample=config["samples"]), | |
f2mluc = expand("report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample=config["samples"]), | |
f1tdpn = expand("report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count", sample=config["samples"]), | |
f2tdpn = expand("report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count", sample=config["samples"]), | |
f1tmluc = expand("report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample=config["samples"]), | |
f2tmluc = expand("report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample=config["samples"]) | |
output: | |
"report/final_report.txt" | |
run: | |
finalout_handle = open(output[0], "w") | |
#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( | |
print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format( | |
"libname", | |
"num_reads", | |
"num_reads_trimmed", | |
"trim_efficiency", | |
"f_GATCGATC", | |
"r_GATCGATC", | |
"f_AATTAATT", | |
"r_AATTAATT", | |
"f_trim_GATCGATC", | |
"r_trim_GATCGATC", | |
"f_trim_AATTAATT", | |
"r_trim_AATTAATT"), | |
file = finalout_handle) | |
for thiss in sorted(config["samples"]): | |
num_reads = read_number_from_file("report/numreads/{}.numreads.txt".format(thiss)) | |
nreads_tr = read_number_from_file("report/numreads/{}.trimmed.numreads.txt".format(thiss)) | |
trim_effi = read_number_from_file("report/efficiency/{}.efficiency.txt".format(thiss)) | |
f_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_f.GATCGATC.count".format(thiss)) | |
r_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_r.GATCGATC.count".format(thiss)) | |
f_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_f.AATTAATT.count".format(thiss)) | |
r_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_r.AATTAATT.count".format(thiss)) | |
f_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_f.trim.GATCGATC.count".format(thiss)) | |
r_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_r.trim.GATCGATC.count".format(thiss)) | |
f_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_f.trim.AATTAATT.count".format(thiss)) | |
r_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_r.trim.AATTAATT.count".format(thiss)) | |
#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( | |
print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format( | |
thiss, | |
num_reads, | |
nreads_tr, | |
trim_effi, | |
int(f_GATCGAT)/int(num_reads), | |
int(r_GATCGAT)/int(num_reads), | |
int(f_AATTAAT)/int(num_reads), | |
int(r_AATTAAT)/int(num_reads), | |
int(f_trim_GA)/int(nreads_tr), | |
int(r_trim_GA)/int(nreads_tr), | |
int(f_trim_AA)/int(nreads_tr), | |
int(r_trim_AA)/int(nreads_tr)), file = finalout_handle) | |
finalout_handle.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from pathlib import Path | |
import yaml | |
configfile: "directories.yaml" | |
# The yaml file should look like this below. Just an object called "directories" | |
# and a list of directories in which the pipeline should look for things. | |
# | |
# directories: | |
# - "/data/raw/genomic_reads/180315_M00160_0073_000000000-BNBV5/" | |
# - "/data/raw/genomic_reads/180123_M00160_0067_000000000-BKDV3/" | |
# - "/data/raw/genomic_reads/171018_M00160_0050_000000000-BFNHV/" | |
# vvvv LOOK HERE vvv | |
# | |
# your path to the directory containing trimmomatic-0.35.jar and adapters/TruSeq3-PE-2.fa | |
# goes here | |
trimmomatic = "/usr/local/bin/Trimmomatic-0.35" | |
ntpath = "/data/ncbi/db/nt" | |
# | |
# | |
# ^^^^ LOOK HERE ^^^ | |
def is_fastq(filepath): | |
suffix_set = set(Path(filepath).suffixes) | |
fastqs = set([".fastq", ".fq"]) | |
fastq_intersection = suffix_set & fastqs | |
if len(fastq_intersection) > 0: | |
return True | |
else: | |
return False | |
#now we must populate the sample list | |
config["samples"] = {} | |
for this_d in config["directories"]: | |
these_files = os.listdir(this_d) | |
fastqs = [filepath for filepath in these_files if is_fastq(filepath)] | |
samplelist = set() | |
for filepath in fastqs: | |
samplelist.add(filepath.split("_")[0]) | |
for sample in samplelist: | |
sample_fastqs = sorted([filepath for filepath in fastqs if sample in filepath]) | |
f = os.path.join(this_d, sample_fastqs[0]) | |
r = os.path.join(this_d, sample_fastqs[1]) | |
if sample in config["samples"]: | |
raise IOError("""Library {} was found in more than two directories. | |
Please only include one set of readpairs per library: | |
- {} | |
- {}""".format(sample, this_d, config["samples"][sample]["f"])) | |
else: | |
config["samples"][sample] = {} | |
config["samples"][sample]["f"] = f | |
config["samples"][sample]["r"] = r | |
rule all: | |
input: | |
expand("trimmed/{sample}_trimlog.txt", sample=config["samples"]), | |
expand("report/blast/{sample}_f.blast.txt", sample= config["samples"]), | |
expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]), | |
expand("fastqc/raw/{sample}_f_fastqc.html", sample = config["samples"]), | |
expand("fastqc/raw/{sample}_f_fastqc.zip", sample = config["samples"]), | |
expand("fastqc/raw/{sample}_r_fastqc.html", sample = config["samples"]), | |
expand("fastqc/raw/{sample}_r_fastqc.zip", sample = config["samples"]), | |
expand("fastqc/trimmed/{sample}_f.trim_fastqc.html", sample = config["samples"]), | |
expand("fastqc/trimmed/{sample}_f.trim_fastqc.zip", sample = config["samples"]), | |
expand("fastqc/trimmed/{sample}_r.trim_fastqc.html", sample = config["samples"]), | |
expand("fastqc/trimmed/{sample}_r.trim_fastqc.zip", sample = config["samples"]), | |
expand("report/numreads/{sample}.numreads.txt",sample = config["samples"]), | |
expand("report/numreads/{sample}.trimmed.numreads.txt", sample = config["samples"]), | |
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), | |
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), | |
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), | |
expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), | |
expand( "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample = config["samples"]), | |
expand( "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample = config["samples"]), | |
expand( "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample = config["samples"]), | |
expand( "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample = config["samples"]), | |
"report/final_report.txt" | |
rule make_fake_reads: | |
input: | |
[config["samples"][sample][readdir] \ | |
for readdir in ['f', 'r'] \ | |
for sample in config["samples"]] | |
output: | |
forward = sorted(expand("reads/{sample}_f.fastq.gz", sample=config["samples"])), | |
reverse = sorted(expand("reads/{sample}_r.fastq.gz", sample=config["samples"])) | |
message: | |
"making soft links" | |
run: | |
for this_sample in sorted(config["samples"]): | |
for this_direction in ["f", "r"]: | |
os.symlink(config["samples"][this_sample][this_direction], | |
"reads/{}_{}.fastq.gz".format(this_sample, this_direction)) | |
rule trim_pairs: | |
input: | |
f1 = "reads/{sample}_f.fastq.gz", | |
f2 = "reads/{sample}_r.fastq.gz", | |
trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"), | |
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa") | |
output: | |
f_paired = "trimmed/{sample}_f.trim.fastq.gz", | |
r_paired = "trimmed/{sample}_r.trim.fastq.gz", | |
f_unpaired = temp("trimmed/{sample}_1.trim.unpaired.fastq.gz"), | |
r_unpaired = temp("trimmed/{sample}_2.trim.unpaired.fastq.gz"), | |
trimlog = "trimmed/{sample}_trimlog.txt" | |
threads: | |
60 | |
message: | |
"adapter and quality trimming files" | |
shell: | |
"""java -jar {input.trim_jar} PE \ | |
-phred33 -threads {threads} \ | |
-trimlog {output.trimlog}\ | |
{input.f1} {input.f2} \ | |
{output.f_paired} \ | |
{output.f_unpaired} \ | |
{output.r_paired} \ | |
{output.r_unpaired} \ | |
ILLUMINACLIP:{input.adapter_path}:2:30:10 \ | |
LEADING:3 TRAILING:3 \ | |
SLIDINGWINDOW:4:15 MINLEN:36""" | |
rule num_reads: | |
input: | |
forward_trim = "trimmed/{sample}_f.trim.fastq.gz", | |
f1 = "reads/{sample}_f.fastq.gz" | |
output: | |
numreads = "report/numreads/{sample}.numreads.txt", | |
trimmed = "report/numreads/{sample}.trimmed.numreads.txt" | |
shell: | |
"""echo $(bioawk -cfastx 'END{{print NR}}' {input.forward_trim}) > {output.trimmed}; \ | |
echo $(bioawk -cfastx 'END{{print NR}}' {input.f1}) > {output.numreads}""" | |
rule library_efficiency: | |
input: | |
numreads = "report/numreads/{sample}.numreads.txt", | |
trimmed = "report/numreads/{sample}.trimmed.numreads.txt" | |
output: | |
"report/efficiency/{sample}.efficiency.txt" | |
message: | |
"calculating the library efficiency" | |
shell: | |
"""cat {input.trimmed} {input.numreads} | \ | |
paste - - | \ | |
awk '{{print $1/$2}}' > {output} """ | |
rule fastq_to_fasta_fwd: | |
input: | |
forward = "trimmed/{sample}_f.trim.fastq.gz" | |
output: | |
temp("trimmed/{sample}_f.fasta") | |
message: | |
"Converting fastq forward file to fasta" | |
shell: | |
"seqtk seq -A {input.forward} > {output}" | |
rule sample_fasta: | |
input: | |
forward = "trimmed/{sample}_f.fasta" | |
output: | |
temp("trimmed/{sample}_f.subsample.fasta") | |
message: | |
"Sampling up to 100k reads from the fasta" | |
shell: | |
"""NUMREADS=$(bioawk -cfastx 'END{{print NR}}' {input.forward});\ | |
if [ $NUMREADS > 100000 ]; then\ | |
echo '{input.forward} has '"$NUMREADS"' reads. Sampling 100k.'; \ | |
seqtk sample -s100 {input.forward} 100000 > {output};\ | |
else\ | |
echo '{input.forward} has '"$NUMREADS"' reads. Not sampling.';\ | |
cp {input.forward} {output};\ | |
fi | |
""" | |
rule blast_sample: | |
input: | |
forward = "trimmed/{sample}_f.subsample.fasta" | |
output: | |
"report/blast/{sample}_f.blast.txt" | |
message: | |
"BLAST the reads" | |
params: | |
pntpath = ntpath | |
threads: | |
96 | |
shell: | |
"""blastn -query {input.forward} \ | |
-db {params.pntpath} -outfmt 6 \ | |
-out {output} -num_threads {threads}""" | |
rule fastqc_raw: | |
input: | |
f1 = "reads/{sample}_f.fastq.gz", | |
f2 = "reads/{sample}_r.fastq.gz", | |
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa") | |
output: | |
"fastqc/raw/{sample}_f_fastqc.html", | |
"fastqc/raw/{sample}_f_fastqc.zip", | |
"fastqc/raw/{sample}_r_fastqc.html", | |
"fastqc/raw/{sample}_r_fastqc.zip" | |
message: | |
"making a fastqc report of the raw reads" | |
params: | |
raw_dir = "fastqc/raw/" | |
shell: | |
"""fastqc -a {input.adapter_path} \ | |
-o {params.raw_dir} \ | |
{input.f1} {input.f2}""" | |
rule fastqc_trimmed: | |
input: | |
f1 = "trimmed/{sample}_f.trim.fastq.gz", | |
f2 = "trimmed/{sample}_r.trim.fastq.gz", | |
adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa") | |
output: | |
"fastqc/trimmed/{sample}_f.trim_fastqc.html", | |
"fastqc/trimmed/{sample}_f.trim_fastqc.zip", | |
"fastqc/trimmed/{sample}_r.trim_fastqc.html", | |
"fastqc/trimmed/{sample}_r.trim_fastqc.zip" | |
message: | |
"making a fastqc report of the trimmed reads" | |
params: | |
raw_dir = "fastqc/trimmed/" | |
shell: | |
"""fastqc -a {input.adapter_path} \ | |
-o {params.raw_dir} \ | |
{input.f1} {input.f2}""" | |
rule GATCGATC_f: | |
input: | |
f1 = "reads/{sample}_f.fastq.gz", | |
f2 = "reads/{sample}_r.fastq.gz", | |
f1t = "trimmed/{sample}_f.trim.fastq.gz", | |
f2t = "trimmed/{sample}_r.trim.fastq.gz" | |
output: | |
f1o = "report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", | |
f2o = "report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count", | |
f1to = "report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count", | |
f2to = "report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count" | |
shell: | |
"set +o pipefail; " | |
"bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'GATCGATC' - | wc -l > {output.f1o}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'GATCGATC' - | wc -l > {output.f2o}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'GATCGATC' - | wc -l > {output.f1to}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'GATCGATC' - | wc -l > {output.f2to}" | |
rule AATTAATT_num: | |
input: | |
f1 = "reads/{sample}_f.fastq.gz", | |
f2 = "reads/{sample}_r.fastq.gz", | |
f1t = "trimmed/{sample}_f.trim.fastq.gz", | |
f2t = "trimmed/{sample}_r.trim.fastq.gz" | |
output: | |
f1o = "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", | |
f2o = "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", | |
f1to = "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", | |
f2to = "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count" | |
shell: | |
"set +o pipefail; " | |
"bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'AATTAATT' - | wc -l > {output.f1o}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'AATTAATT' - | wc -l > {output.f2o}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'AATTAATT' - | wc -l > {output.f1to}; " | |
"bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'AATTAATT' - | wc -l > {output.f2to}" | |
def read_number_from_file(filename): | |
with open(filename, "r") as f: | |
for line in f: | |
if line.strip(): | |
return line.strip() | |
rule collate_report: | |
""" this method prints out the final qc report of all the libraries.""" | |
input: | |
raw_num = expand("report/numreads/{sample}.numreads.txt", sample=config["samples"]), | |
trim_num = expand("report/numreads/{sample}.trimmed.numreads.txt", sample=config["samples"]), | |
efficiency = expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]), | |
f1dpn = expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample=config["samples"]), | |
f2dpn = expand("report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count", sample=config["samples"]), | |
f1mluc = expand("report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample=config["samples"]), | |
f2mluc = expand("report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample=config["samples"]), | |
f1tdpn = expand("report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count", sample=config["samples"]), | |
f2tdpn = expand("report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count", sample=config["samples"]), | |
f1tmluc = expand("report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample=config["samples"]), | |
f2tmluc = expand("report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample=config["samples"]) | |
output: | |
"report/final_report.txt" | |
run: | |
finalout_handle = open(output[0], "w") | |
#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( | |
print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format( | |
"libname", | |
"num_reads", | |
"num_reads_trimmed", | |
"trim_efficiency", | |
"f_GATCGATC", | |
"r_GATCGATC", | |
"f_AATTAATT", | |
"r_AATTAATT", | |
"f_trim_GATCGATC", | |
"r_trim_GATCGATC", | |
"f_trim_AATTAATT", | |
"r_trim_AATTAATT"), | |
file = finalout_handle) | |
for thiss in sorted(config["samples"]): | |
num_reads = read_number_from_file("report/numreads/{}.numreads.txt".format(thiss)) | |
nreads_tr = read_number_from_file("report/numreads/{}.trimmed.numreads.txt".format(thiss)) | |
trim_effi = read_number_from_file("report/efficiency/{}.efficiency.txt".format(thiss)) | |
f_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_f.GATCGATC.count".format(thiss)) | |
r_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_r.GATCGATC.count".format(thiss)) | |
f_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_f.AATTAATT.count".format(thiss)) | |
r_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_r.AATTAATT.count".format(thiss)) | |
f_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_f.trim.GATCGATC.count".format(thiss)) | |
r_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_r.trim.GATCGATC.count".format(thiss)) | |
f_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_f.trim.AATTAATT.count".format(thiss)) | |
r_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_r.trim.AATTAATT.count".format(thiss)) | |
#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( | |
print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format( | |
thiss, | |
num_reads, | |
nreads_tr, | |
trim_effi, | |
int(f_GATCGAT)/int(num_reads), | |
int(r_GATCGAT)/int(num_reads), | |
int(f_AATTAAT)/int(num_reads), | |
int(r_AATTAAT)/int(num_reads), | |
int(f_trim_GA)/int(nreads_tr), | |
int(r_trim_GA)/int(nreads_tr), | |
int(f_trim_AA)/int(nreads_tr), | |
int(r_trim_AA)/int(nreads_tr)), file = finalout_handle) | |
finalout_handle.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment