conchoecia · April 13, 2018 13:24
diff --git a/MiSeqLibraryQC(Snakefile) b/MiSeqLibraryQC(Snakefile)
 import os
 from pathlib import Path
 import yaml

 configfile: "directories.yaml"

 # The yaml file should look like this below. Just an object called "directories"
 #  and a list of directories in which the pipeline should look for things.
 #
 #  directories:
 #    - "/data/raw/genomic_reads/180315_M00160_0073_000000000-BNBV5/"
 #    - "/data/raw/genomic_reads/180123_M00160_0067_000000000-BKDV3/"
 #    - "/data/raw/genomic_reads/171018_M00160_0050_000000000-BFNHV/"

 #   vvvv  LOOK HERE vvv
 #
 #  your path to the directory containing trimmomatic-0.35.jar and adapters/TruSeq3-PE-2.fa
 #   goes here
 trimmomatic = "/usr/local/bin/Trimmomatic-0.35"
 ntpath = "/data/ncbi/db/nt"
 #
 #
 #   ^^^^  LOOK HERE ^^^

 def is_fastq(filepath):
    suffix_set = set(Path(filepath).suffixes)
    fastqs = set([".fastq", ".fq"])
    fastq_intersection = suffix_set & fastqs
    if len(fastq_intersection) > 0:
        return True
    else:
        return False

 #now we must populate the sample list
 config["samples"] = {}
 for this_d in config["directories"]:
    these_files = os.listdir(this_d)
    fastqs = [filepath for filepath in these_files if is_fastq(filepath)]
    samplelist = set()
    for filepath in fastqs:
        samplelist.add(filepath.split("_")[0])
    for sample in samplelist:
        sample_fastqs = sorted([filepath for filepath in fastqs if sample in filepath])
        f = os.path.join(this_d, sample_fastqs[0])
        r = os.path.join(this_d, sample_fastqs[1])
        if sample in config["samples"]:
            raise IOError("""Library {} was found in more than two directories.
          Please only include one set of readpairs per library:
           - {}
           - {}""".format(sample, this_d, config["samples"][sample]["f"]))
        else:
            config["samples"][sample] = {}
            config["samples"][sample]["f"] = f
            config["samples"][sample]["r"] = r

 rule all:
    input:
        expand("trimmed/{sample}_trimlog.txt", sample=config["samples"]),
        expand("report/blast/{sample}_f.blast.txt", sample= config["samples"]),
        expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
        expand("fastqc/raw/{sample}_f_fastqc.html", sample = config["samples"]),
        expand("fastqc/raw/{sample}_f_fastqc.zip", sample = config["samples"]),
        expand("fastqc/raw/{sample}_r_fastqc.html", sample = config["samples"]),
        expand("fastqc/raw/{sample}_r_fastqc.zip", sample = config["samples"]),
        expand("fastqc/trimmed/{sample}_f.trim_fastqc.html", sample = config["samples"]),
        expand("fastqc/trimmed/{sample}_f.trim_fastqc.zip", sample = config["samples"]),
        expand("fastqc/trimmed/{sample}_r.trim_fastqc.html", sample = config["samples"]),
        expand("fastqc/trimmed/{sample}_r.trim_fastqc.zip", sample = config["samples"]),
        expand("report/numreads/{sample}.numreads.txt",sample = config["samples"]),
        expand("report/numreads/{sample}.trimmed.numreads.txt", sample = config["samples"]),
        expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), 
        expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
        expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
        expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
        expand(      "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample = config["samples"]),
        expand(      "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample = config["samples"]),
        expand( "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample = config["samples"]),
        expand( "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample = config["samples"]),
        "report/final_report.txt"


 rule make_fake_reads:
    input:
        [config["samples"][sample][readdir] \
           for readdir in ['f', 'r'] \
           for sample in config["samples"]]
    output:
        forward = sorted(expand("reads/{sample}_f.fastq.gz", sample=config["samples"])),
        reverse = sorted(expand("reads/{sample}_r.fastq.gz", sample=config["samples"]))
    message:
        "making soft links"
    run:
        for this_sample in sorted(config["samples"]):
            for this_direction in ["f", "r"]:
                os.symlink(config["samples"][this_sample][this_direction],
                           "reads/{}_{}.fastq.gz".format(this_sample, this_direction))

 rule trim_pairs:
    input:
        f1 = "reads/{sample}_f.fastq.gz",
        f2 = "reads/{sample}_r.fastq.gz",
        trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
        adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
    output:
        f_paired =   "trimmed/{sample}_f.trim.fastq.gz",
        r_paired =   "trimmed/{sample}_r.trim.fastq.gz",
        f_unpaired = temp("trimmed/{sample}_1.trim.unpaired.fastq.gz"),
        r_unpaired = temp("trimmed/{sample}_2.trim.unpaired.fastq.gz"),
        trimlog = "trimmed/{sample}_trimlog.txt"
    threads:
        60
    message:
        "adapter and quality trimming files"
    shell:
        """java -jar {input.trim_jar} PE \
        -phred33 -threads {threads} \
        -trimlog {output.trimlog}\
        {input.f1} {input.f2} \
        {output.f_paired} \
        {output.f_unpaired} \
        {output.r_paired} \
        {output.r_unpaired} \
        ILLUMINACLIP:{input.adapter_path}:2:30:10 \
        LEADING:3 TRAILING:3 \
        SLIDINGWINDOW:4:15 MINLEN:36"""

 rule num_reads:
    input:
        forward_trim =   "trimmed/{sample}_f.trim.fastq.gz",
        f1 = "reads/{sample}_f.fastq.gz"
    output:
        numreads = "report/numreads/{sample}.numreads.txt",
        trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
    shell:
        """echo $(bioawk -cfastx 'END{{print NR}}' {input.forward_trim}) > {output.trimmed}; \
        echo $(bioawk -cfastx 'END{{print NR}}' {input.f1}) > {output.numreads}"""

 rule library_efficiency:
    input:
        numreads = "report/numreads/{sample}.numreads.txt",
        trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
    output:
        "report/efficiency/{sample}.efficiency.txt"
    message:
        "calculating the library efficiency"
    shell:
        """cat {input.trimmed} {input.numreads} | \
        paste - - | \
        awk '{{print $1/$2}}' > {output} """


 rule fastq_to_fasta_fwd:
    input:
        forward =   "trimmed/{sample}_f.trim.fastq.gz"
    output:
        temp("trimmed/{sample}_f.fasta")
    message:
        "Converting fastq forward file to fasta"
    shell:
        "seqtk seq -A {input.forward} > {output}"

 rule sample_fasta:
    input:
        forward =  "trimmed/{sample}_f.fasta"
    output:
        temp("trimmed/{sample}_f.subsample.fasta")
    message:
        "Sampling up to 100k reads from the fasta"
    shell:
        """NUMREADS=$(bioawk -cfastx 'END{{print NR}}' {input.forward});\
        if [ $NUMREADS  > 100000 ]; then\
               echo '{input.forward} has '"$NUMREADS"' reads. Sampling 100k.'; \
               seqtk sample -s100 {input.forward} 100000 > {output};\
           else\
               echo '{input.forward} has '"$NUMREADS"' reads. Not sampling.';\
               cp {input.forward} {output};\
           fi
        """

 rule blast_sample:
    input:
        forward =  "trimmed/{sample}_f.subsample.fasta"
    output:
        "report/blast/{sample}_f.blast.txt"
    message:
        "BLAST the reads"
    params:
        pntpath = ntpath
    threads:
        96
    shell:
        """blastn -query {input.forward} \
        -db {params.pntpath} -outfmt 6 \
        -out {output} -num_threads {threads}"""

 rule fastqc_raw:
    input:
        f1 = "reads/{sample}_f.fastq.gz",
        f2 = "reads/{sample}_r.fastq.gz",
        adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
    output:
        "fastqc/raw/{sample}_f_fastqc.html",
        "fastqc/raw/{sample}_f_fastqc.zip",
        "fastqc/raw/{sample}_r_fastqc.html",
        "fastqc/raw/{sample}_r_fastqc.zip"
    message:
        "making a fastqc report of the raw reads"
    params:
        raw_dir = "fastqc/raw/"
    shell:
        """fastqc -a {input.adapter_path} \
        -o {params.raw_dir} \
        {input.f1} {input.f2}"""

 rule fastqc_trimmed:
    input:
        f1 =   "trimmed/{sample}_f.trim.fastq.gz",
        f2 =   "trimmed/{sample}_r.trim.fastq.gz",
        adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
    output:
        "fastqc/trimmed/{sample}_f.trim_fastqc.html",
        "fastqc/trimmed/{sample}_f.trim_fastqc.zip",
        "fastqc/trimmed/{sample}_r.trim_fastqc.html",
        "fastqc/trimmed/{sample}_r.trim_fastqc.zip"
    message:
        "making a fastqc report of the trimmed reads"
    params:
        raw_dir = "fastqc/trimmed/"
    shell:
        """fastqc -a {input.adapter_path} \
        -o {params.raw_dir} \
        {input.f1} {input.f2}"""

 rule GATCGATC_f:
    input:
        f1 = "reads/{sample}_f.fastq.gz",
        f2 = "reads/{sample}_r.fastq.gz",
        f1t =   "trimmed/{sample}_f.trim.fastq.gz",
        f2t =   "trimmed/{sample}_r.trim.fastq.gz"
    output:
        f1o = "report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count",
        f2o = "report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count",
        f1to = "report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count",
        f2to = "report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count"
    shell:
        "set +o pipefail; "
        "bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'GATCGATC'  - | wc -l > {output.f1o}; "
        "bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'GATCGATC'  - | wc -l > {output.f2o}; " 
        "bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'GATCGATC' - | wc -l > {output.f1to}; "
        "bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'GATCGATC' - | wc -l > {output.f2to}"

 rule AATTAATT_num:
    input:
        f1 = "reads/{sample}_f.fastq.gz",
        f2 = "reads/{sample}_r.fastq.gz",
        f1t =   "trimmed/{sample}_f.trim.fastq.gz",
        f2t =   "trimmed/{sample}_r.trim.fastq.gz"
    output:
        f1o =       "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count",
        f2o =       "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count",
        f1to = "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count",
        f2to = "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count"

    shell:
        "set +o pipefail; " 
        "bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'AATTAATT'  - | wc -l > {output.f1o}; "
        "bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'AATTAATT'  - | wc -l > {output.f2o}; "
        "bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'AATTAATT' - | wc -l > {output.f1to}; "
        "bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'AATTAATT' - | wc -l > {output.f2to}"

 def read_number_from_file(filename):
    with open(filename, "r") as f:
       for line in f:
           if line.strip():
               return line.strip()

 rule collate_report:
    """ this method prints out the final qc report of all the libraries."""
    input:
        raw_num =       expand("report/numreads/{sample}.numreads.txt", sample=config["samples"]),
        trim_num =      expand("report/numreads/{sample}.trimmed.numreads.txt", sample=config["samples"]),
        efficiency =    expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
        f1dpn =         expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample=config["samples"]),
        f2dpn =         expand("report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count", sample=config["samples"]),
        f1mluc =        expand("report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample=config["samples"]),
        f2mluc =        expand("report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample=config["samples"]),
        f1tdpn =        expand("report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count", sample=config["samples"]),
        f2tdpn =        expand("report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count", sample=config["samples"]),
        f1tmluc =       expand("report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample=config["samples"]),
        f2tmluc =       expand("report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample=config["samples"])

    output:
        "report/final_report.txt"
    run:
        finalout_handle = open(output[0], "w")
        #print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
        print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
                        "libname",
                        "num_reads",
                        "num_reads_trimmed",
                        "trim_efficiency",
                        "f_GATCGATC",
                        "r_GATCGATC",
                        "f_AATTAATT",
                        "r_AATTAATT",
                        "f_trim_GATCGATC",
                        "r_trim_GATCGATC",
                        "f_trim_AATTAATT",
                        "r_trim_AATTAATT"),
              file = finalout_handle)
        for thiss in sorted(config["samples"]):
            num_reads = read_number_from_file("report/numreads/{}.numreads.txt".format(thiss))
            nreads_tr = read_number_from_file("report/numreads/{}.trimmed.numreads.txt".format(thiss))
            trim_effi = read_number_from_file("report/efficiency/{}.efficiency.txt".format(thiss))
            f_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_f.GATCGATC.count".format(thiss))
            r_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_r.GATCGATC.count".format(thiss))
            f_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_f.AATTAATT.count".format(thiss))
            r_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_r.AATTAATT.count".format(thiss))
            f_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_f.trim.GATCGATC.count".format(thiss))
            r_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_r.trim.GATCGATC.count".format(thiss))
            f_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_f.trim.AATTAATT.count".format(thiss))
            r_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_r.trim.AATTAATT.count".format(thiss))
            #print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
            print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
                    thiss,
                    num_reads,
                    nreads_tr,
                    trim_effi,
                    int(f_GATCGAT)/int(num_reads),
                    int(r_GATCGAT)/int(num_reads),
                    int(f_AATTAAT)/int(num_reads),
                    int(r_AATTAAT)/int(num_reads),
                    int(f_trim_GA)/int(nreads_tr),
                    int(r_trim_GA)/int(nreads_tr),
                    int(f_trim_AA)/int(nreads_tr),
                    int(r_trim_AA)/int(nreads_tr)), file = finalout_handle)
        finalout_handle.close()
diff --git a/Snakefile b/Snakefile
 import os
 from pathlib import Path
 import yaml

 configfile: "directories.yaml"

 # The yaml file should look like this below. Just an object called "directories"
 #  and a list of directories in which the pipeline should look for things.
 #
 #  directories:
 #    - "/data/raw/genomic_reads/180315_M00160_0073_000000000-BNBV5/"
 #    - "/data/raw/genomic_reads/180123_M00160_0067_000000000-BKDV3/"
 #    - "/data/raw/genomic_reads/171018_M00160_0050_000000000-BFNHV/"

 #   vvvv  LOOK HERE vvv
 #
 #  your path to the directory containing trimmomatic-0.35.jar and adapters/TruSeq3-PE-2.fa
 #   goes here
 trimmomatic = "/usr/local/bin/Trimmomatic-0.35"
 ntpath = "/data/ncbi/db/nt"
 #
 #
 #   ^^^^  LOOK HERE ^^^

 def is_fastq(filepath):
    suffix_set = set(Path(filepath).suffixes)
    fastqs = set([".fastq", ".fq"])
    fastq_intersection = suffix_set & fastqs
    if len(fastq_intersection) > 0:
        return True
    else:
        return False

 #now we must populate the sample list
 config["samples"] = {}
 for this_d in config["directories"]:
    these_files = os.listdir(this_d)
    fastqs = [filepath for filepath in these_files if is_fastq(filepath)]
    samplelist = set()
    for filepath in fastqs:
        samplelist.add(filepath.split("_")[0])
    for sample in samplelist:
        sample_fastqs = sorted([filepath for filepath in fastqs if sample in filepath])
        f = os.path.join(this_d, sample_fastqs[0])
        r = os.path.join(this_d, sample_fastqs[1])
        if sample in config["samples"]:
            raise IOError("""Library {} was found in more than two directories.
          Please only include one set of readpairs per library:
           - {}
           - {}""".format(sample, this_d, config["samples"][sample]["f"]))
        else:
            config["samples"][sample] = {}
            config["samples"][sample]["f"] = f
            config["samples"][sample]["r"] = r

 rule all:
    input:
        expand("trimmed/{sample}_trimlog.txt", sample=config["samples"]),
        expand("report/blast/{sample}_f.blast.txt", sample= config["samples"]),
        expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
        expand("fastqc/raw/{sample}_f_fastqc.html", sample = config["samples"]),
        expand("fastqc/raw/{sample}_f_fastqc.zip", sample = config["samples"]),
        expand("fastqc/raw/{sample}_r_fastqc.html", sample = config["samples"]),
        expand("fastqc/raw/{sample}_r_fastqc.zip", sample = config["samples"]),
        expand("fastqc/trimmed/{sample}_f.trim_fastqc.html", sample = config["samples"]),
        expand("fastqc/trimmed/{sample}_f.trim_fastqc.zip", sample = config["samples"]),
        expand("fastqc/trimmed/{sample}_r.trim_fastqc.html", sample = config["samples"]),
        expand("fastqc/trimmed/{sample}_r.trim_fastqc.zip", sample = config["samples"]),
        expand("report/numreads/{sample}.numreads.txt",sample = config["samples"]),
        expand("report/numreads/{sample}.trimmed.numreads.txt", sample = config["samples"]),
        expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]), 
        expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
        expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
        expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
        expand(      "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample = config["samples"]),
        expand(      "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample = config["samples"]),
        expand( "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample = config["samples"]),
        expand( "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample = config["samples"]),
        "report/final_report.txt"


 rule make_fake_reads:
    input:
        [config["samples"][sample][readdir] \
           for readdir in ['f', 'r'] \
           for sample in config["samples"]]
    output:
        forward = sorted(expand("reads/{sample}_f.fastq.gz", sample=config["samples"])),
        reverse = sorted(expand("reads/{sample}_r.fastq.gz", sample=config["samples"]))
    message:
        "making soft links"
    run:
        for this_sample in sorted(config["samples"]):
            for this_direction in ["f", "r"]:
                os.symlink(config["samples"][this_sample][this_direction],
                           "reads/{}_{}.fastq.gz".format(this_sample, this_direction))

 rule trim_pairs:
    input:
        f1 = "reads/{sample}_f.fastq.gz",
        f2 = "reads/{sample}_r.fastq.gz",
        trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
        adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
    output:
        f_paired =   "trimmed/{sample}_f.trim.fastq.gz",
        r_paired =   "trimmed/{sample}_r.trim.fastq.gz",
        f_unpaired = temp("trimmed/{sample}_1.trim.unpaired.fastq.gz"),
        r_unpaired = temp("trimmed/{sample}_2.trim.unpaired.fastq.gz"),
        trimlog = "trimmed/{sample}_trimlog.txt"
    threads:
        60
    message:
        "adapter and quality trimming files"
    shell:
        """java -jar {input.trim_jar} PE \
        -phred33 -threads {threads} \
        -trimlog {output.trimlog}\
        {input.f1} {input.f2} \
        {output.f_paired} \
        {output.f_unpaired} \
        {output.r_paired} \
        {output.r_unpaired} \
        ILLUMINACLIP:{input.adapter_path}:2:30:10 \
        LEADING:3 TRAILING:3 \
        SLIDINGWINDOW:4:15 MINLEN:36"""

 rule num_reads:
    input:
        forward_trim =   "trimmed/{sample}_f.trim.fastq.gz",
        f1 = "reads/{sample}_f.fastq.gz"
    output:
        numreads = "report/numreads/{sample}.numreads.txt",
        trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
    shell:
        """echo $(bioawk -cfastx 'END{{print NR}}' {input.forward_trim}) > {output.trimmed}; \
        echo $(bioawk -cfastx 'END{{print NR}}' {input.f1}) > {output.numreads}"""

 rule library_efficiency:
    input:
        numreads = "report/numreads/{sample}.numreads.txt",
        trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
    output:
        "report/efficiency/{sample}.efficiency.txt"
    message:
        "calculating the library efficiency"
    shell:
        """cat {input.trimmed} {input.numreads} | \
        paste - - | \
        awk '{{print $1/$2}}' > {output} """


 rule fastq_to_fasta_fwd:
    input:
        forward =   "trimmed/{sample}_f.trim.fastq.gz"
    output:
        temp("trimmed/{sample}_f.fasta")
    message:
        "Converting fastq forward file to fasta"
    shell:
        "seqtk seq -A {input.forward} > {output}"

 rule sample_fasta:
    input:
        forward =  "trimmed/{sample}_f.fasta"
    output:
        temp("trimmed/{sample}_f.subsample.fasta")
    message:
        "Sampling up to 100k reads from the fasta"
    shell:
        """NUMREADS=$(bioawk -cfastx 'END{{print NR}}' {input.forward});\
        if [ $NUMREADS  > 100000 ]; then\
               echo '{input.forward} has '"$NUMREADS"' reads. Sampling 100k.'; \
               seqtk sample -s100 {input.forward} 100000 > {output};\
           else\
               echo '{input.forward} has '"$NUMREADS"' reads. Not sampling.';\
               cp {input.forward} {output};\
           fi
        """

 rule blast_sample:
    input:
        forward =  "trimmed/{sample}_f.subsample.fasta"
    output:
        "report/blast/{sample}_f.blast.txt"
    message:
        "BLAST the reads"
    params:
        pntpath = ntpath
    threads:
        96
    shell:
        """blastn -query {input.forward} \
        -db {params.pntpath} -outfmt 6 \
        -out {output} -num_threads {threads}"""

 rule fastqc_raw:
    input:
        f1 = "reads/{sample}_f.fastq.gz",
        f2 = "reads/{sample}_r.fastq.gz",
        adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
    output:
        "fastqc/raw/{sample}_f_fastqc.html",
        "fastqc/raw/{sample}_f_fastqc.zip",
        "fastqc/raw/{sample}_r_fastqc.html",
        "fastqc/raw/{sample}_r_fastqc.zip"
    message:
        "making a fastqc report of the raw reads"
    params:
        raw_dir = "fastqc/raw/"
    shell:
        """fastqc -a {input.adapter_path} \
        -o {params.raw_dir} \
        {input.f1} {input.f2}"""

 rule fastqc_trimmed:
    input:
        f1 =   "trimmed/{sample}_f.trim.fastq.gz",
        f2 =   "trimmed/{sample}_r.trim.fastq.gz",
        adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
    output:
        "fastqc/trimmed/{sample}_f.trim_fastqc.html",
        "fastqc/trimmed/{sample}_f.trim_fastqc.zip",
        "fastqc/trimmed/{sample}_r.trim_fastqc.html",
        "fastqc/trimmed/{sample}_r.trim_fastqc.zip"
    message:
        "making a fastqc report of the trimmed reads"
    params:
        raw_dir = "fastqc/trimmed/"
    shell:
        """fastqc -a {input.adapter_path} \
        -o {params.raw_dir} \
        {input.f1} {input.f2}"""

 rule GATCGATC_f:
    input:
        f1 = "reads/{sample}_f.fastq.gz",
        f2 = "reads/{sample}_r.fastq.gz",
        f1t =   "trimmed/{sample}_f.trim.fastq.gz",
        f2t =   "trimmed/{sample}_r.trim.fastq.gz"
    output:
        f1o = "report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count",
        f2o = "report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count",
        f1to = "report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count",
        f2to = "report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count"
    shell:
        "set +o pipefail; "
        "bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'GATCGATC'  - | wc -l > {output.f1o}; "
        "bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'GATCGATC'  - | wc -l > {output.f2o}; " 
        "bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'GATCGATC' - | wc -l > {output.f1to}; "
        "bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'GATCGATC' - | wc -l > {output.f2to}"

 rule AATTAATT_num:
    input:
        f1 = "reads/{sample}_f.fastq.gz",
        f2 = "reads/{sample}_r.fastq.gz",
        f1t =   "trimmed/{sample}_f.trim.fastq.gz",
        f2t =   "trimmed/{sample}_r.trim.fastq.gz"
    output:
        f1o =       "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count",
        f2o =       "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count",
        f1to = "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count",
        f2to = "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count"

    shell:
        "set +o pipefail; " 
        "bioawk -cfastx '{{print $seq}}' {input.f1} | grep 'AATTAATT'  - | wc -l > {output.f1o}; "
        "bioawk -cfastx '{{print $seq}}' {input.f2} | grep 'AATTAATT'  - | wc -l > {output.f2o}; "
        "bioawk -cfastx '{{print $seq}}' {input.f1t} | grep 'AATTAATT' - | wc -l > {output.f1to}; "
        "bioawk -cfastx '{{print $seq}}' {input.f2t} | grep 'AATTAATT' - | wc -l > {output.f2to}"

 def read_number_from_file(filename):
    with open(filename, "r") as f:
       for line in f:
           if line.strip():
               return line.strip()

 rule collate_report:
    """ this method prints out the final qc report of all the libraries."""
    input:
        raw_num =       expand("report/numreads/{sample}.numreads.txt", sample=config["samples"]),
        trim_num =      expand("report/numreads/{sample}.trimmed.numreads.txt", sample=config["samples"]),
        efficiency =    expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
        f1dpn =         expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample=config["samples"]),
        f2dpn =         expand("report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count", sample=config["samples"]),
        f1mluc =        expand("report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample=config["samples"]),
        f2mluc =        expand("report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample=config["samples"]),
        f1tdpn =        expand("report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count", sample=config["samples"]),
        f2tdpn =        expand("report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count", sample=config["samples"]),
        f1tmluc =       expand("report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample=config["samples"]),
        f2tmluc =       expand("report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample=config["samples"])

    output:
        "report/final_report.txt"
    run:
        finalout_handle = open(output[0], "w")
        #print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
        print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
                        "libname",
                        "num_reads",
                        "num_reads_trimmed",
                        "trim_efficiency",
                        "f_GATCGATC",
                        "r_GATCGATC",
                        "f_AATTAATT",
                        "r_AATTAATT",
                        "f_trim_GATCGATC",
                        "r_trim_GATCGATC",
                        "f_trim_AATTAATT",
                        "r_trim_AATTAATT"),
              file = finalout_handle)
        for thiss in sorted(config["samples"]):
            num_reads = read_number_from_file("report/numreads/{}.numreads.txt".format(thiss))
            nreads_tr = read_number_from_file("report/numreads/{}.trimmed.numreads.txt".format(thiss))
            trim_effi = read_number_from_file("report/efficiency/{}.efficiency.txt".format(thiss))
            f_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_f.GATCGATC.count".format(thiss))
            r_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_r.GATCGATC.count".format(thiss))
            f_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_f.AATTAATT.count".format(thiss))
            r_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_r.AATTAATT.count".format(thiss))
            f_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_f.trim.GATCGATC.count".format(thiss))
            r_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_r.trim.GATCGATC.count".format(thiss))
            f_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_f.trim.AATTAATT.count".format(thiss))
            r_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_r.trim.AATTAATT.count".format(thiss))
            #print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
            print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
                    thiss,
                    num_reads,
                    nreads_tr,
                    trim_effi,
                    int(f_GATCGAT)/int(num_reads),
                    int(r_GATCGAT)/int(num_reads),
                    int(f_AATTAAT)/int(num_reads),
                    int(r_AATTAAT)/int(num_reads),
                    int(f_trim_GA)/int(nreads_tr),
                    int(r_trim_GA)/int(nreads_tr),
                    int(f_trim_AA)/int(nreads_tr),
                    int(r_trim_AA)/int(nreads_tr)), file = finalout_handle)
        finalout_handle.close()
	import os
	from pathlib import Path
	import yaml

	configfile: "directories.yaml"

	# The yaml file should look like this below. Just an object called "directories"
	# and a list of directories in which the pipeline should look for things.
	#
	# directories:
	# - "/data/raw/genomic_reads/180315_M00160_0073_000000000-BNBV5/"
	# - "/data/raw/genomic_reads/180123_M00160_0067_000000000-BKDV3/"
	# - "/data/raw/genomic_reads/171018_M00160_0050_000000000-BFNHV/"

	# vvvv LOOK HERE vvv
	#
	# your path to the directory containing trimmomatic-0.35.jar and adapters/TruSeq3-PE-2.fa
	# goes here
	trimmomatic = "/usr/local/bin/Trimmomatic-0.35"
	ntpath = "/data/ncbi/db/nt"
	#
	#
	# ^^^^ LOOK HERE ^^^

	def is_fastq(filepath):
	suffix_set = set(Path(filepath).suffixes)
	fastqs = set([".fastq", ".fq"])
	fastq_intersection = suffix_set & fastqs
	if len(fastq_intersection) > 0:
	return True
	else:
	return False

	#now we must populate the sample list
	config["samples"] = {}
	for this_d in config["directories"]:
	these_files = os.listdir(this_d)
	fastqs = [filepath for filepath in these_files if is_fastq(filepath)]
	samplelist = set()
	for filepath in fastqs:
	samplelist.add(filepath.split("_")[0])
	for sample in samplelist:
	sample_fastqs = sorted([filepath for filepath in fastqs if sample in filepath])
	f = os.path.join(this_d, sample_fastqs[0])
	r = os.path.join(this_d, sample_fastqs[1])
	if sample in config["samples"]:
	raise IOError("""Library {} was found in more than two directories.
	Please only include one set of readpairs per library:
	- {}
	- {}""".format(sample, this_d, config["samples"][sample]["f"]))
	else:
	config["samples"][sample] = {}
	config["samples"][sample]["f"] = f
	config["samples"][sample]["r"] = r

	rule all:
	input:
	expand("trimmed/{sample}_trimlog.txt", sample=config["samples"]),
	expand("report/blast/{sample}_f.blast.txt", sample= config["samples"]),
	expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
	expand("fastqc/raw/{sample}_f_fastqc.html", sample = config["samples"]),
	expand("fastqc/raw/{sample}_f_fastqc.zip", sample = config["samples"]),
	expand("fastqc/raw/{sample}_r_fastqc.html", sample = config["samples"]),
	expand("fastqc/raw/{sample}_r_fastqc.zip", sample = config["samples"]),
	expand("fastqc/trimmed/{sample}_f.trim_fastqc.html", sample = config["samples"]),
	expand("fastqc/trimmed/{sample}_f.trim_fastqc.zip", sample = config["samples"]),
	expand("fastqc/trimmed/{sample}_r.trim_fastqc.html", sample = config["samples"]),
	expand("fastqc/trimmed/{sample}_r.trim_fastqc.zip", sample = config["samples"]),
	expand("report/numreads/{sample}.numreads.txt",sample = config["samples"]),
	expand("report/numreads/{sample}.trimmed.numreads.txt", sample = config["samples"]),
	expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
	expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
	expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
	expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample = config["samples"]),
	expand( "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample = config["samples"]),
	expand( "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample = config["samples"]),
	expand( "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample = config["samples"]),
	expand( "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample = config["samples"]),
	"report/final_report.txt"


	rule make_fake_reads:
	input:
	[config["samples"][sample][readdir] \
	for readdir in ['f', 'r'] \
	for sample in config["samples"]]
	output:
	forward = sorted(expand("reads/{sample}_f.fastq.gz", sample=config["samples"])),
	reverse = sorted(expand("reads/{sample}_r.fastq.gz", sample=config["samples"]))
	message:
	"making soft links"
	run:
	for this_sample in sorted(config["samples"]):
	for this_direction in ["f", "r"]:
	os.symlink(config["samples"][this_sample][this_direction],
	"reads/{}_{}.fastq.gz".format(this_sample, this_direction))

	rule trim_pairs:
	input:
	f1 = "reads/{sample}_f.fastq.gz",
	f2 = "reads/{sample}_r.fastq.gz",
	trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
	adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
	output:
	f_paired = "trimmed/{sample}_f.trim.fastq.gz",
	r_paired = "trimmed/{sample}_r.trim.fastq.gz",
	f_unpaired = temp("trimmed/{sample}_1.trim.unpaired.fastq.gz"),
	r_unpaired = temp("trimmed/{sample}_2.trim.unpaired.fastq.gz"),
	trimlog = "trimmed/{sample}_trimlog.txt"
	threads:
	60
	message:
	"adapter and quality trimming files"
	shell:
	"""java -jar {input.trim_jar} PE \
	-phred33 -threads {threads} \
	-trimlog {output.trimlog}\
	{input.f1} {input.f2} \
	{output.f_paired} \
	{output.f_unpaired} \
	{output.r_paired} \
	{output.r_unpaired} \
	ILLUMINACLIP:{input.adapter_path}:2:30:10 \
	LEADING:3 TRAILING:3 \
	SLIDINGWINDOW:4:15 MINLEN:36"""

	rule num_reads:
	input:
	forward_trim = "trimmed/{sample}_f.trim.fastq.gz",
	f1 = "reads/{sample}_f.fastq.gz"
	output:
	numreads = "report/numreads/{sample}.numreads.txt",
	trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
	shell:
	"""echo $(bioawk -cfastx 'END{{print NR}}' {input.forward_trim}) > {output.trimmed}; \
	echo $(bioawk -cfastx 'END{{print NR}}' {input.f1}) > {output.numreads}"""

	rule library_efficiency:
	input:
	numreads = "report/numreads/{sample}.numreads.txt",
	trimmed = "report/numreads/{sample}.trimmed.numreads.txt"
	output:
	"report/efficiency/{sample}.efficiency.txt"
	message:
	"calculating the library efficiency"
	shell:
	"""cat {input.trimmed} {input.numreads} \| \
	paste - - \| \
	awk '{{print $1/$2}}' > {output} """


	rule fastq_to_fasta_fwd:
	input:
	forward = "trimmed/{sample}_f.trim.fastq.gz"
	output:
	temp("trimmed/{sample}_f.fasta")
	message:
	"Converting fastq forward file to fasta"
	shell:
	"seqtk seq -A {input.forward} > {output}"

	rule sample_fasta:
	input:
	forward = "trimmed/{sample}_f.fasta"
	output:
	temp("trimmed/{sample}_f.subsample.fasta")
	message:
	"Sampling up to 100k reads from the fasta"
	shell:
	"""NUMREADS=$(bioawk -cfastx 'END{{print NR}}' {input.forward});\
	if [ $NUMREADS > 100000 ]; then\
	echo '{input.forward} has '"$NUMREADS"' reads. Sampling 100k.'; \
	seqtk sample -s100 {input.forward} 100000 > {output};\
	else\
	echo '{input.forward} has '"$NUMREADS"' reads. Not sampling.';\
	cp {input.forward} {output};\
	fi
	"""

	rule blast_sample:
	input:
	forward = "trimmed/{sample}_f.subsample.fasta"
	output:
	"report/blast/{sample}_f.blast.txt"
	message:
	"BLAST the reads"
	params:
	pntpath = ntpath
	threads:
	96
	shell:
	"""blastn -query {input.forward} \
	-db {params.pntpath} -outfmt 6 \
	-out {output} -num_threads {threads}"""

	rule fastqc_raw:
	input:
	f1 = "reads/{sample}_f.fastq.gz",
	f2 = "reads/{sample}_r.fastq.gz",
	adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
	output:
	"fastqc/raw/{sample}_f_fastqc.html",
	"fastqc/raw/{sample}_f_fastqc.zip",
	"fastqc/raw/{sample}_r_fastqc.html",
	"fastqc/raw/{sample}_r_fastqc.zip"
	message:
	"making a fastqc report of the raw reads"
	params:
	raw_dir = "fastqc/raw/"
	shell:
	"""fastqc -a {input.adapter_path} \
	-o {params.raw_dir} \
	{input.f1} {input.f2}"""

	rule fastqc_trimmed:
	input:
	f1 = "trimmed/{sample}_f.trim.fastq.gz",
	f2 = "trimmed/{sample}_r.trim.fastq.gz",
	adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2-fastqc.fa")
	output:
	"fastqc/trimmed/{sample}_f.trim_fastqc.html",
	"fastqc/trimmed/{sample}_f.trim_fastqc.zip",
	"fastqc/trimmed/{sample}_r.trim_fastqc.html",
	"fastqc/trimmed/{sample}_r.trim_fastqc.zip"
	message:
	"making a fastqc report of the trimmed reads"
	params:
	raw_dir = "fastqc/trimmed/"
	shell:
	"""fastqc -a {input.adapter_path} \
	-o {params.raw_dir} \
	{input.f1} {input.f2}"""

	rule GATCGATC_f:
	input:
	f1 = "reads/{sample}_f.fastq.gz",
	f2 = "reads/{sample}_r.fastq.gz",
	f1t = "trimmed/{sample}_f.trim.fastq.gz",
	f2t = "trimmed/{sample}_r.trim.fastq.gz"
	output:
	f1o = "report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count",
	f2o = "report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count",
	f1to = "report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count",
	f2to = "report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count"
	shell:
	"set +o pipefail; "
	"bioawk -cfastx '{{print $seq}}' {input.f1} \| grep 'GATCGATC' - \| wc -l > {output.f1o}; "
	"bioawk -cfastx '{{print $seq}}' {input.f2} \| grep 'GATCGATC' - \| wc -l > {output.f2o}; "
	"bioawk -cfastx '{{print $seq}}' {input.f1t} \| grep 'GATCGATC' - \| wc -l > {output.f1to}; "
	"bioawk -cfastx '{{print $seq}}' {input.f2t} \| grep 'GATCGATC' - \| wc -l > {output.f2to}"

	rule AATTAATT_num:
	input:
	f1 = "reads/{sample}_f.fastq.gz",
	f2 = "reads/{sample}_r.fastq.gz",
	f1t = "trimmed/{sample}_f.trim.fastq.gz",
	f2t = "trimmed/{sample}_r.trim.fastq.gz"
	output:
	f1o = "report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count",
	f2o = "report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count",
	f1to = "report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count",
	f2to = "report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count"

	shell:
	"set +o pipefail; "
	"bioawk -cfastx '{{print $seq}}' {input.f1} \| grep 'AATTAATT' - \| wc -l > {output.f1o}; "
	"bioawk -cfastx '{{print $seq}}' {input.f2} \| grep 'AATTAATT' - \| wc -l > {output.f2o}; "
	"bioawk -cfastx '{{print $seq}}' {input.f1t} \| grep 'AATTAATT' - \| wc -l > {output.f1to}; "
	"bioawk -cfastx '{{print $seq}}' {input.f2t} \| grep 'AATTAATT' - \| wc -l > {output.f2to}"

	def read_number_from_file(filename):
	with open(filename, "r") as f:
	for line in f:
	if line.strip():
	return line.strip()

	rule collate_report:
	""" this method prints out the final qc report of all the libraries."""
	input:
	raw_num = expand("report/numreads/{sample}.numreads.txt", sample=config["samples"]),
	trim_num = expand("report/numreads/{sample}.trimmed.numreads.txt", sample=config["samples"]),
	efficiency = expand("report/efficiency/{sample}.efficiency.txt", sample=config["samples"]),
	f1dpn = expand("report/linkercontent/GATCGATC/{sample}_f.GATCGATC.count", sample=config["samples"]),
	f2dpn = expand("report/linkercontent/GATCGATC/{sample}_r.GATCGATC.count", sample=config["samples"]),
	f1mluc = expand("report/linkercontent/AATTAATT/{sample}_f.AATTAATT.count", sample=config["samples"]),
	f2mluc = expand("report/linkercontent/AATTAATT/{sample}_r.AATTAATT.count", sample=config["samples"]),
	f1tdpn = expand("report/linkercontent/GATCGATC/{sample}_f.trim.GATCGATC.count", sample=config["samples"]),
	f2tdpn = expand("report/linkercontent/GATCGATC/{sample}_r.trim.GATCGATC.count", sample=config["samples"]),
	f1tmluc = expand("report/linkercontent/AATTAATT/{sample}_f.trim.AATTAATT.count", sample=config["samples"]),
	f2tmluc = expand("report/linkercontent/AATTAATT/{sample}_r.trim.AATTAATT.count", sample=config["samples"])

	output:
	"report/final_report.txt"
	run:
	finalout_handle = open(output[0], "w")
	#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
	print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
	"libname",
	"num_reads",
	"num_reads_trimmed",
	"trim_efficiency",
	"f_GATCGATC",
	"r_GATCGATC",
	"f_AATTAATT",
	"r_AATTAATT",
	"f_trim_GATCGATC",
	"r_trim_GATCGATC",
	"f_trim_AATTAATT",
	"r_trim_AATTAATT"),
	file = finalout_handle)
	for thiss in sorted(config["samples"]):
	num_reads = read_number_from_file("report/numreads/{}.numreads.txt".format(thiss))
	nreads_tr = read_number_from_file("report/numreads/{}.trimmed.numreads.txt".format(thiss))
	trim_effi = read_number_from_file("report/efficiency/{}.efficiency.txt".format(thiss))
	f_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_f.GATCGATC.count".format(thiss))
	r_GATCGAT = read_number_from_file("report/linkercontent/GATCGATC/{}_r.GATCGATC.count".format(thiss))
	f_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_f.AATTAATT.count".format(thiss))
	r_AATTAAT = read_number_from_file("report/linkercontent/AATTAATT/{}_r.AATTAATT.count".format(thiss))
	f_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_f.trim.GATCGATC.count".format(thiss))
	r_trim_GA = read_number_from_file("report/linkercontent/GATCGATC/{}_r.trim.GATCGATC.count".format(thiss))
	f_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_f.trim.AATTAATT.count".format(thiss))
	r_trim_AA = read_number_from_file("report/linkercontent/AATTAATT/{}_r.trim.AATTAATT.count".format(thiss))
	#print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
	print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
	thiss,
	num_reads,
	nreads_tr,
	trim_effi,
	int(f_GATCGAT)/int(num_reads),
	int(r_GATCGAT)/int(num_reads),
	int(f_AATTAAT)/int(num_reads),
	int(r_AATTAAT)/int(num_reads),
	int(f_trim_GA)/int(nreads_tr),
	int(r_trim_GA)/int(nreads_tr),
	int(f_trim_AA)/int(nreads_tr),
	int(r_trim_AA)/int(nreads_tr)), file = finalout_handle)
	finalout_handle.close()