Last active
April 28, 2018 00:24
-
-
Save conchoecia/9413c1306387a4e58d322920c5118576 to your computer and use it in GitHub Desktop.
map reads to a locus and reassemble
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The goal of this script is to map reads to a region and to assemble it de novo. | |
# This is useful for scenarios in which there are gaps between known homologous sequence, | |
# like what one might encounter when doing a de novo mitochondrial genome assembly | |
# or assembling a gene locus from a transcript. | |
# | |
#The steps for this assembly process are. | |
# 1) Map all of the reads to the scaffold | |
# 2) Extract all of the read pairs from the original fastq files and make new fastqs. | |
# 3) Use the new Fastq files in a de novo Spades assembly. | |
# The output of this is fed back into step 1 | |
# I found the idea for the iterative use of snakemake here: | |
# https://groups.google.com/forum/#!searchin/Snakemake/dirty$20trick$20loop%7Csort:date/snakemake/Qb3sBXuIriQ/Y9jdconuuxMJ | |
configfile: "config.yaml" | |
PARAM = 10 | |
rule all: | |
input: | |
"analysis/done_mail.txt" | |
rule done: | |
""" | |
email the haddock lab slack that the job is done. | |
""" | |
input: | |
"analysis/round{param}/round{param}.sorted.bam.bai".format(param=PARAM-1) | |
output: | |
"analysis/done_mail.txt" | |
shell: | |
"""mail -s "DTS Erenna Locus Assembler done" -t [email protected] <<< "Finished" && \ | |
touch {output}""" | |
rule initial_copy: | |
"""This copies the initial fasta file to the starting directory and formats | |
it as a multiline fasta file to ensure that bwa index works properly. | |
""" | |
input: | |
config["files"]["fasta"] | |
output: | |
"analysis/round0/round0.fasta" | |
shell: | |
"""bioawk -cfastx '{{system("printf \\">%s\\"" $name " >> {output}"); \ | |
system("echo '' >> {output}"); \ | |
system("echo " $seq " | fold -w 60 - >> {output}") }}' {input}""" | |
for i in range(PARAM): | |
print("in round {}".format(i)) | |
rule: | |
# 1) Map all of the reads to the scaffold | |
""" | |
Index the fasta file for mapping. | |
""" | |
input: | |
"analysis/round{param}/round{param}.fasta".format(param=i) | |
output: | |
"analysis/round{param}/round{param}.fasta.amb".format(param=i), | |
"analysis/round{param}/round{param}.fasta.ann".format(param=i), | |
"analysis/round{param}/round{param}.fasta.bwt".format(param=i), | |
"analysis/round{param}/round{param}.fasta.pac".format(param=i), | |
"analysis/round{param}/round{param}.fasta.sa".format(param=i) | |
shell: | |
"bwa index {input}" | |
rule: | |
""" | |
Map the reads | |
""" | |
input: | |
"analysis/round{param}/round{param}.fasta.amb".format(param=i), | |
"analysis/round{param}/round{param}.fasta.ann".format(param=i), | |
"analysis/round{param}/round{param}.fasta.bwt".format(param=i), | |
"analysis/round{param}/round{param}.fasta.pac".format(param=i), | |
"analysis/round{param}/round{param}.fasta.sa".format(param=i), | |
this_fasta = "analysis/round{param}/round{param}.fasta".format(param=i), | |
forward = config["files"]["forward"], | |
reverse = config["files"]["reverse"] | |
output: | |
bam = "analysis/round{param}/round{param}.sorted.bam".format(param=i) | |
threads: | |
90 | |
shell: | |
"""bwa mem -t {threads} {input.this_fasta} {input.forward} {input.reverse} | \ | |
samtools view -bh -G 12 -@ {threads} - | \ | |
samtools sort -@ {threads} - > {output.bam}""" | |
rule: | |
""" | |
index the sorted bam file to view in IGV | |
""" | |
input: | |
bam = "analysis/round{param}/round{param}.sorted.bam".format(param=i) | |
output: | |
bai = "analysis/round{param}/round{param}.sorted.bam.bai".format(param=i) | |
shell: | |
"samtools index {input}" | |
rule: | |
""" | |
make fastq files using the bam file | |
""" | |
input: | |
bam = "analysis/round{param}/round{param}.sorted.bam".format(param=i), | |
bai = "analysis/round{param}/round{param}.sorted.bam.bai".format(param=i) | |
output: | |
fq_f = "analysis/round{param}/round{param}_1.fq".format(param=i), | |
fq_r = "analysis/round{param}/round{param}_2.fq".format(param=i), | |
fq_s = "analysis/round{param}/round{param}_s.fq".format(param=i) | |
shell: | |
"samtools fastq {input.bam} -1 {output.fq_f} -2 {output.fq_r} -N -s {output.fq_s}" | |
rule: | |
""" | |
assemble the genome region with the reads in hand | |
""" | |
input: | |
fq_f = "analysis/round{param}/round{param}_1.fq".format(param=i), | |
fq_r = "analysis/round{param}/round{param}_2.fq".format(param=i), | |
fq_s = "analysis/round{param}/round{param}_s.fq".format(param=i) | |
output: | |
outdir = "analysis/round{param}/spades/".format(param=i), | |
contigs = "analysis/round{param}/spades/contigs.fasta".format(param=i), | |
scaffolds = "analysis/round{param}/spades/scaffolds.fasta".format(param=i) | |
threads: | |
90 | |
shell: | |
"""spades.py -t {threads} -m 500 -k 55,101 \ | |
-o {output.outdir} \ | |
-1 {input.fq_f} \ | |
-2 {input.fq_r} \ | |
-s {input.fq_s} | |
""" | |
rule: | |
""" | |
This step joins the seed fasta file with the scaffolds from the spades | |
assembly into the new seed for the next round | |
""" | |
input: | |
scaffolds = "analysis/round{param}/spades/scaffolds.fasta".format(param=i), | |
og_fasta = "analysis/round0/round0.fasta" | |
output: | |
new_seed = "analysis/round{param}/round{param}.fasta".format(param = i+1) | |
shell: | |
"""echo {input.scaffolds} >> {output.new_seed} &&\ | |
echo {input.og_fasta} >> {output.new_seed} | |
""" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment