Created
June 6, 2019 06:51
-
-
Save conchoecia/4047de8bad7e78523e3aff9a0a3a6083 to your computer and use it in GitHub Desktop.
transcriptome_assembly_pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
samples:
  Thalassicolla_SRR7533167:
    f: "/rna_reads/SRA/SRR7533167_1.fastq.gz"
    r: "/rna_reads/SRA/SRR7533167_2.fastq.gz"
    SS_lib_type: "RF"
    library: "SRR7533167"
    GLO: "NA"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Author: Darrin Schultz @conchoecia | |
File: Transcriptome assembly, annotation, and db creation | |
Instructions: | |
- To run this script and all of its analyses: make sure that python 3, | |
snakemake, and biopython are installed on your Unix computer. | |
- Execute the following command: `snakemake --cores 45`, replacing `45` | |
with the number of threads available on your machine. | |
""" | |
import os | |
import sys | |
from Bio import SeqIO | |
import shutil | |
configfile: "config.yaml"

# Pipeline-wide constants.
trimmomatic = "/usr/local/bin/Trimmomatic-0.35"  # Trimmomatic install directory
maxthreads = 90                                  # thread ceiling for heavy rules
dockeruser = "YOUR_USERNAME_HERE"                # placeholder docker username
curr_dir = os.getcwd()
print(curr_dir)

# Collect every configured read file and normalize the per-sample
# SS_lib_type value into the literal Trinity argument it should become.
# FIX: the original made two separate passes over config["samples"] and
# enumerated every capitalization ("rf", "RF", "Rf", "rF", ...) by hand;
# one pass with a .lower() comparison covers the same values and more.
config["rna_f"] = {}
config["rna_r"] = {}
config["input_reads"] = []
for sample in config["samples"]:
    config["input_reads"].append(config["samples"][sample]["f"])
    config["input_reads"].append(config["samples"][sample]["r"])
    # "none"-like values mean the library is unstranded: pass no flag.
    # (Assumes SS_lib_type is always a string, as in the sample config.)
    lib = config["samples"][sample]["SS_lib_type"].lower()
    if lib in {"none", "na", "no", "nein"}:
        config["samples"][sample]["SS_lib_type"] = ""
    elif lib == "rf":
        config["samples"][sample]["SS_lib_type"] = "--SS_lib_type RF"
    elif lib == "fr":
        config["samples"][sample]["SS_lib_type"] = "--SS_lib_type FR"
    # any other value is left untouched, matching the original behavior
def read_number_from_file(filename):
    """Return the first non-blank line of *filename*, stripped.

    Returns None when the file contains no non-blank line.
    """
    with open(filename, "r") as handle:
        stripped = (raw.strip() for raw in handle)
        return next((text for text in stripped if text), None)
def singleline_to_multiline(infile, outfile):
    """Rewrite a fasta file so each record's sequence is line-wrapped.

    Biopython's fasta writer wraps sequences at its default width, so a
    single-line-per-record file (e.g. Trinity output) becomes multi-line.

    :param infile: path to the input fasta file
    :param outfile: path the wrapped fasta is written to (overwritten)
    """
    # FIX: the original used open(infile, "rU"); the "U" mode flag was
    # removed in Python 3.11 and raises ValueError there. Plain "r"
    # already gives universal-newline handling in Python 3. Both handles
    # are now context-managed so nothing leaks if parsing raises.
    with open(infile, "r") as in_handle, open(outfile, "w") as out_handle:
        for record in SeqIO.parse(in_handle, "fasta"):
            SeqIO.write(record, out_handle, "fasta")
rule all:
    # Top-level target rule: one expand() per class of final artifact, for
    # every sample in the config. Requesting these drives the whole DAG.
    input:
        #make the symlinks
        expand("reads/{sample}_f.fastq.gz", sample = config["samples"]),
        expand("reads/{sample}_r.fastq.gz", sample = config["samples"]),
        ##trim the reads
        expand("trimmed/{sample}_{readtype}.trim.fastq.gz", sample = config["samples"], readtype = ["f", "r"]),
        #now assemble the transcriptomes
        expand("txomes/raw/trinity_{sample}_raw.fasta", sample = config["samples"]),
        # make it multi-line
        expand("txomes/final/{sample}.fasta", sample = config["samples"]),
        # transdecoder and translate, rename the pep files
        expand("pepfiles/final/{sample}.pep", sample = config["samples"]),
        # softlinks for /data/ncbi/db
        expand("/data/ncbi/db/{sample}.fasta", sample = config["samples"]),
        expand("/data/ncbi/db/{sample}.pep", sample = config["samples"]),
        # make the blast databases
        expand("/data/ncbi/db/{sample}.fasta.nhr", sample = config["samples"]),
        expand("/data/ncbi/db/{sample}.pep.phr", sample = config["samples"]),
        expand("/data/ncbi/db/{sample}.dmnd", sample = config["samples"]),
        # make the report (per-sample stats files plus the collated table)
        expand("info/counts/raw/{sample}_raw_count.txt", sample = config["samples"]),
        expand("info/counts/trimmed/{sample}_trimmed_count.txt", sample = config["samples"]),
        expand("info/trimming/txome/{sample}_qual_rejects.txt", sample = config["samples"]),
        expand("info/trimming/txome/{sample}_adapter_rejects.txt", sample = config["samples"]),
        expand("info/trimming/txome/{sample}_gc_rejects.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_fasta_count.txt", sample = config["samples"]),
        expand("info/counts/peps/{sample}_pep_count.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_N50.txt", sample = config["samples"]),
        expand("info/counts/peps/{sample}_N50.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_meanlen.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_N90.txt", sample = config["samples"]),
        expand("info/counts/fasta/{sample}_mb.txt", sample = config["samples"]),
        "report/final_report.txt"
############################################################### | |
# __ __ ___ __ __ __ __ ___ __ __ _ __ | |
# |__) |__) |__ |__) |__) / \ / ` |__ /__` /__` | |\ | / _` | |
# | | \ |___ | | \ \__/ \__, |___ .__/ .__/ | | \| \__> | |
# | |
# These are the rules for preprocessing the inputs in order | |
# to prepare for the rest of the pipeline. | |
# preprocessing rule 1 | |
rule make_symlink_f:
    # Symlink one sample's forward-read fastq into reads/ under a
    # uniform name, creating the reads/ directory on first use.
    output:
        illumina_f = "reads/{sample}_f.fastq.gz",
    run:
        if not os.path.exists("reads"):
            print("Making a directory called 'reads'.", file = sys.stderr)
            os.makedirs("reads")
        else:
            print("The 'reads' directory exists already.", file = sys.stderr)
        #make symlinks for the illumina reads
        print("Making read symlinks.", file =sys.stderr)
        print(" - Checking files for sample {}".format(wildcards.sample), file=sys.stderr)
        rna_f = "reads/{}_f.fastq.gz".format(wildcards.sample)
        if not os.path.exists(rna_f):
            print(" - Making a symlink for {}".format(rna_f), file=sys.stderr)
            reads_path = config["samples"][wildcards.sample]["f"]
            # fail early if the configured source file does not exist
            if not os.path.exists(reads_path):
                raise Exception("{} does not exist.".format(reads_path))
            os.symlink(reads_path, rna_f)
        else:
            print(" - A symlink already exists for {}".format(rna_f), file=sys.stderr)
rule make_symlink_r:
    # Symlink one sample's reverse-read fastq into reads/ under a
    # uniform name, creating the reads/ directory on first use.
    # FIX: the original wrapped the body in "for sample in
    # config['samples']" but still referenced wildcards.sample inside the
    # loop, so it repeated the exact same work once per configured sample.
    # The loop is removed; the rule now mirrors make_symlink_f.
    output:
        illumina_r = "reads/{sample}_r.fastq.gz",
    run:
        if not os.path.exists("reads"):
            print("Making a directory called 'reads'.", file = sys.stderr)
            os.makedirs("reads")
        else:
            print("The 'reads' directory exists already.", file = sys.stderr)
        #make symlinks for the illumina reads
        print("Making read symlinks.", file =sys.stderr)
        print(" - Checking files for sample {}".format(wildcards.sample), file=sys.stderr)
        rna_r = "reads/{}_r.fastq.gz".format(wildcards.sample)
        if not os.path.exists(rna_r):
            print(" - Making a symlink for {}".format(rna_r), file=sys.stderr)
            reads_path = config["samples"][wildcards.sample]["r"]
            # fail early if the configured source file does not exist
            if not os.path.exists(reads_path):
                raise Exception("{} does not exist.".format(reads_path))
            os.symlink(reads_path, rna_r)
        else:
            print(" - A symlink already exists for {}".format(rna_r), file=sys.stderr)
# | |
# end of rules for preprocessing | |
# | |
########################################################### | |
############################################################## | |
# _ ____ _ | |
# (_) / /_ ______ ___ (_)___ ____ _ | |
# / / / / / / / __ `__ \/ / __ \/ __ `/ | |
# / / / / /_/ / / / / / / / / / / /_/ / | |
# /_/_/_/\__,_/_/ /_/ /_/_/_/ /_/\__,_/ | |
# | |
# These are the rules for working with the Illumina reads. | |
# illumina rule 1 | |
rule trim_pairs:
    # Adapter- and quality-trim one sample's read pair with Trimmomatic.
    # The jar and adapter fasta are declared as inputs so snakemake fails
    # fast when the Trimmomatic install is missing.
    input:
        f = "reads/{sample}_f.fastq.gz",
        r = "reads/{sample}_r.fastq.gz",
        trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
        adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
    output:
        f = "trimmed/{sample}_f.trim.fastq.gz",
        r = "trimmed/{sample}_r.trim.fastq.gz",
        # reads whose mate was discarded; temp() deletes them when
        # no downstream rule needs them anymore
        u1= temp("trimmed/{sample}_f.trim.unpaired.fastq.gz"),
        u2= temp("trimmed/{sample}_r.trim.unpaired.fastq.gz")
    threads:
        maxthreads
    shell:
        """java -jar {input.trim_jar} PE \
        -phred33 -threads {threads} \
        {input.f} {input.r} \
        {output.f} \
        {output.u1} \
        {output.r} \
        {output.u2} \
        ILLUMINACLIP:{input.adapter_path}:2:30:10:1:TRUE \
        LEADING:3 TRAILING:3 \
        SLIDINGWINDOW:4:15 MINLEN:36"""
# illumina rule 2 | |
rule assemble_txome:
    # Assemble one sample's transcriptome with Trinity running inside a
    # docker container; the container's Trinity.fasta is then moved to
    # txomes/raw/ and the bulky working directory is deleted.
    input:
        f_paired = "trimmed/{sample}_f.trim.fastq.gz",
        r_paired = "trimmed/{sample}_r.trim.fastq.gz"
    output:
        assemblypath = "txomes/raw/trinity_{sample}_raw.fasta"
    params:
        outpath = "txomes/trinity_{sample}",
        outfasta = "txomes/trinity_{sample}/Trinity.fasta",
        dockeruser = dockeruser,
        # strandedness flag normalized at config-load time ("" or
        # "--SS_lib_type RF/FR")
        sslibtype = lambda wildcards: config["samples"][wildcards.sample]["SS_lib_type"]
    threads:
        maxthreads
    # NOTE(review): --trimmomatic asks Trinity to trim again even though
    # the inputs already passed through trim_pairs — confirm intended.
    # NOTE(review): --max_memory is hard-coded to 100G; verify the host.
    shell:
        """
        docker run \
        -u $(id -u):$(id -g) --rm \
        -v `pwd`:`pwd` \
        -v /etc/passwd:/etc/passwd \
        trinity_bowtie2.3.4.1 Trinity \
        --seqType fq \
        --left `pwd`/{input.f_paired} \
        --right `pwd`/{input.r_paired} \
        --max_memory 100G \
        --CPU {threads} \
        --trimmomatic \
        {params.sslibtype} \
        --output `pwd`/{params.outpath}; \
        mv {params.outfasta} {output.assemblypath}
        rm -rf {params.outpath}
        """
# illumina rule 3 | |
rule correct_txome_names:
    # Replace the generic ">TRINITY" prefix on every fasta header with the
    # sample name so sequence IDs are unique across samples.
    input:
        assem = "txomes/raw/trinity_{sample}_raw.fasta"
    output:
        fassem = temp("txomes/singleline/singleline_trinity_{sample}.fasta")
    params:
        samplename = "{sample}"
    shell:
        """
        sed 's/^>TRINITY/>{params.samplename}/' {input.assem} > {output.fassem}
        """
# illumina rule 4 | |
rule illumina_txome_single_line_to_multiline:
    """
    This turns the single line output of Trinity into a multi-line fasta
    file that can be indexed and used in something like bwa.
    """
    input:
        fassem = "txomes/singleline/singleline_trinity_{sample}.fasta"
    output:
        assem = "txomes/final/{sample}.fasta"
    run:
        # Biopython's fasta writer rewraps each record at its default width.
        singleline_to_multiline(input.fassem, output.assem)
# illumina rule 5 | |
rule translate_txome:
    # Predict ORFs from the transcriptome with TransDecoder. The output
    # path implies TransDecoder.LongOrfs writes its work directory
    # ("<basename>.transdecoder_dir") into the current working directory
    # — confirmed only by the output pattern below, not by a -O flag.
    input:
        txome = "txomes/final/{sample}.fasta"
    output:
        outpeps = temp("{sample}.fasta.transdecoder_dir/longest_orfs.pep")
    shell:
        """
        TransDecoder.LongOrfs -t {input.txome}
        """
# illumina rule 5.1 | |
rule trim_transdecoder_names:
    """
    This trims transdecoder names from an extended string like:
    >CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1.p1 type:internal len:122 gc:universal CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1:1-363(+)
    to:
    >CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1.p1
    """
    input:
        pepfile = "{sample}.fasta.transdecoder_dir/longest_orfs.pep"
    output:
        renamed = temp("pepfiles/temp/{sample}_longest_orfs_renamed.pep")
    # bioawk's $name is the header token before the first whitespace, so
    # reprinting ">$name\n$seq" drops the description text.
    shell:
        """
        bioawk -cfastx '{{printf(">%s\\n%s\\n", $name, $seq)}}' {input.pepfile} > {output.renamed}
        """
# illumina rule 6 | |
rule move_and_multiline_peps:
    """
    This moves the translated txome to its final resting place and
    turns it from singleline to multiline.
    """
    input:
        pepfile = "pepfiles/temp/{sample}_longest_orfs_renamed.pep"
    output:
        pepfinal = "pepfiles/final/{sample}.pep"
    params:
        # TransDecoder scratch paths left in the working directory by
        # translate_txome; removed once the final pep file is written
        rmdir = lambda wildcards: "{}.fasta.transdecoder_dir".format(wildcards.sample),
        checkpoints = lambda wildcards: "{}.fasta.transdecoder_dir.__checkpoints_longorfs".format(wildcards.sample)
    run:
        singleline_to_multiline(input.pepfile, output.pepfinal)
        # clean up TransDecoder leftovers; guarded so reruns don't fail
        if os.path.exists(params.rmdir):
            shutil.rmtree(params.rmdir)
        if os.path.exists(params.checkpoints):
            shutil.rmtree(params.checkpoints)
# illumina rule 7 fasta softlink to db | |
rule softlink_fasta_data:
    # Expose the final transcriptome in /data/ncbi/db via an absolute
    # symlink (built from curr_dir so the link works from any directory).
    input:
        assem = "txomes/final/{sample}.fasta"
    output:
        outlink = "/data/ncbi/db/{sample}.fasta"
    params:
        absln = lambda wildcards: "{}/txomes/final/{}.fasta".format(curr_dir, wildcards.sample)
    shell:
        """
        ln -s {params.absln} {output.outlink}
        """
# illumina rule 8 pep softlink to db | |
rule softlink_pep_data:
    # Expose the final peptide file in /data/ncbi/db via an absolute
    # symlink (built from curr_dir so the link works from any directory).
    input:
        pep = "pepfiles/final/{sample}.pep"
    output:
        outlink = "/data/ncbi/db/{sample}.pep"
    params:
        absln = lambda wildcards: "{}/pepfiles/final/{}.pep".format(curr_dir, wildcards.sample)
    shell:
        """
        ln -s {params.absln} {output.outlink}
        """
# illumina rule 9 | |
rule nucl_db_data:
    # Build a nucleotide BLAST database beside the symlinked fasta.
    # makeblastdb writes several files; the .nhr header file stands in
    # for the whole set as this rule's output.
    input:
        inp = "/data/ncbi/db/{sample}.fasta"
    output:
        out = "/data/ncbi/db/{sample}.fasta.nhr"
    shell:
        """
        makeblastdb -in {input.inp} -input_type fasta -dbtype nucl -parse_seqids
        """
# illumina rule 10 | |
rule prot_db_data:
    # Build a protein BLAST database beside the symlinked pep file.
    # makeblastdb writes several files; the .phr header file stands in
    # for the whole set as this rule's output.
    input:
        inp = "/data/ncbi/db/{sample}.pep"
    output:
        out = "/data/ncbi/db/{sample}.pep.phr"
    shell:
        """
        makeblastdb -in {input.inp} -input_type fasta -dbtype prot -parse_seqids
        """
# ILLUMINA rule 11 | |
rule diamond_data:
    # Build a DIAMOND protein database from the peptide file.
    input:
        pepfile="/data/ncbi/db/{sample}.pep"
    output:
        ddb = "/data/ncbi/db/{sample}.dmnd"
    shell:
        """
        diamond makedb --in {input.pepfile} -d {output.ddb}
        """
# report | |
# this section handles writing a report on all of the samples. | |
# Fields to acquire: | |
# - number of reads in untrimmed | |
# - number of reads in trimmed | |
# - number of transcripts | |
# - number of peps | |
rule number_reads_untrimmed:
    # Count raw reads: bioawk parses fastx records, so NR at END equals
    # the read count. Only the forward file is counted (pairs are
    # symmetric by construction).
    input:
        raw = "reads/{sample}_f.fastq.gz"
    output:
        raw_count = "info/counts/raw/{sample}_raw_count.txt"
    shell:
        """
        echo $(bioawk -cfastx 'END{{print NR}}' {input.raw}) > {output.raw_count}
        """
rule number_reads_trimmed:
    # Count surviving read pairs after trimming; only the forward file is
    # counted since trim_pairs keeps pairs together.
    input:
        trimmed = "trimmed/{sample}_f.trim.fastq.gz"
    output:
        trimmed_count = "info/counts/trimmed/{sample}_trimmed_count.txt"
    shell:
        """
        echo $(bioawk -cfastx 'END{{print NR}}' {input.trimmed}) > {output.trimmed_count}
        """
rule number_transcripts:
    # Count assembled transcripts (fasta records) in the final assembly.
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_counts = "info/counts/fasta/{sample}_fasta_count.txt"
    shell:
        """
        echo $(bioawk -cfastx 'END{{print NR}}' {input.fasta}) > {output.fasta_counts}
        """
rule number_peps:
    # Count predicted peptides (fasta records) in the final pep file.
    input:
        pep = "pepfiles/final/{sample}.pep"
    output:
        pep_count = "info/counts/peps/{sample}_pep_count.txt"
    shell:
        """
        echo $(bioawk -cfastx 'END{{print NR}}' {input.pep}) > {output.pep_count}
        """
rule calcN50_tx:
    # Transcript N50: sort all sequence lengths ascending, then walk the
    # cumulative sum and print the first length at which it reaches half
    # of the total assembled bases.
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_N50 = "info/counts/fasta/{sample}_N50.txt"
    shell:
        """
        bioawk -cfastx '{{print(length($seq))}}' {input.fasta} | \
        sort -n | \
        awk '{{len[i++]=$1;sum+=$1}} \
        END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
        if (csum>=sum/2) {{print len[j];break}} }} }}' > {output.fasta_N50}
        """
rule calcN90_tx:
    # Transcript N90: same cumulative-sum walk as calcN50_tx but the
    # threshold is 90% of the total assembled bases.
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_N90 = "info/counts/fasta/{sample}_N90.txt"
    shell:
        """
        bioawk -cfastx '{{print(length($seq))}}' {input.fasta} | \
        sort -n | \
        awk '{{len[i++]=$1;sum+=$1}} \
        END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
        if (csum>=(sum*0.9)) {{print len[j];break}} }} }}' > {output.fasta_N90}
        """
rule calcN50_pep:
    # Peptide N50: identical cumulative-sum walk to calcN50_tx, run on
    # the predicted peptide lengths instead of transcript lengths.
    input:
        pep = "pepfiles/final/{sample}.pep"
    output:
        pep_N50 = "info/counts/peps/{sample}_N50.txt"
    shell:
        """
        bioawk -cfastx '{{print(length($seq))}}' {input.pep} | \
        sort -n | \
        awk '{{len[i++]=$1;sum+=$1}} \
        END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
        if (csum>=sum/2) {{print len[j];break}} }} }}' > {output.pep_N50}
        """
rule quality_rejects:
    # Placeholder: always writes a literal 0. The real quality-reject
    # count is not computed anywhere in this file; the report column
    # exists so the final table has a stable shape.
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        q_rejects = "info/trimming/txome/{sample}_qual_rejects.txt"
    shell:
        """
        echo "0" > {output.q_rejects}
        """
rule adapter_rejects:
    # Placeholder: always writes a literal 0 (see quality_rejects).
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        a_rejects = "info/trimming/txome/{sample}_adapter_rejects.txt"
    shell:
        """
        echo "0" > {output.a_rejects}
        """
rule GC_rejects:
    # Placeholder: always writes a literal 0 (see quality_rejects).
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        g_rejects = "info/trimming/txome/{sample}_gc_rejects.txt"
    shell:
        """
        echo "0" > {output.g_rejects}
        """
rule calcmean:
    # Mean transcript length, formatted to two decimals. printf is used
    # without \n, so the output file has no trailing newline.
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_meanlen = "info/counts/fasta/{sample}_meanlen.txt"
    shell:
        """
        bioawk -cfastx '{{sum += length($seq)}} \
        END{{printf("%.2f", sum/NR)}}' {input.fasta} > {output.fasta_meanlen}
        """
rule calcMB:
    # Total assembled bases divided by 1e6 (megabases), two decimals;
    # like calcmean, the output has no trailing newline.
    input:
        fasta = "txomes/final/{sample}.fasta"
    output:
        fasta_numMB = "info/counts/fasta/{sample}_mb.txt"
    shell:
        """
        bioawk -cfastx '{{sum += length($seq)}} \
        END{{printf("%.2f", sum/1000000)}}' {input.fasta} > {output.fasta_numMB}
        """
rule collate_report:
    """ this method prints out the final qc report of all the libraries."""
    input:
        raw_num = expand("info/counts/raw/{sample}_raw_count.txt", sample = config["samples"]),
        qual_reject = expand("info/trimming/txome/{sample}_qual_rejects.txt", sample = config["samples"]),
        adap_reject = expand("info/trimming/txome/{sample}_adapter_rejects.txt", sample = config["samples"]),
        gc_reject = expand("info/trimming/txome/{sample}_gc_rejects.txt", sample = config["samples"]),
        trim_num = expand("info/counts/trimmed/{sample}_trimmed_count.txt", sample = config["samples"]),
        txome_count = expand("info/counts/fasta/{sample}_fasta_count.txt", sample = config["samples"]),
        pep_count = expand("info/counts/peps/{sample}_pep_count.txt", sample = config["samples"]),
        txome_N50 = expand("info/counts/fasta/{sample}_N50.txt", sample = config["samples"]),
        pep_N50 = expand("info/counts/peps/{sample}_N50.txt", sample = config["samples"]),
        tx_meanlen = expand("info/counts/fasta/{sample}_meanlen.txt", sample = config["samples"]),
        tx_N90 = expand("info/counts/fasta/{sample}_N90.txt", sample = config["samples"]),
        tx_MB = expand("info/counts/fasta/{sample}_mb.txt", sample = config["samples"]),
    output:
        "report/final_report.txt"
    run:
        # Column order of the report; trim_efficiency is derived below.
        header = ["sample", "trim_efficiency", "num_reads",
                  "num_reads_trimmed", "quality_rejects", "adapter_rejects",
                  "gc_rejects", "transcripts_count", "transcripts_meanlen",
                  "transcripts_N50", "transcripts_N90", "transcripts_MB",
                  "peps_count", "peps_N50"]
        # FIX: the report file is now context-managed so it is closed even
        # if reading a per-sample stat file raises; the dead commented-out
        # tab-format line was removed. Output bytes are unchanged.
        with open(output[0], "w") as finalout_handle:
            print(", ".join(header), file = finalout_handle)
            for thiss in sorted(config["samples"]):
                # Pull every per-sample stat written by the rules above.
                raw_num = read_number_from_file("info/counts/raw/{}_raw_count.txt".format(thiss))
                trim_num = read_number_from_file("info/counts/trimmed/{}_trimmed_count.txt".format(thiss))
                txome_count = read_number_from_file("info/counts/fasta/{}_fasta_count.txt".format(thiss))
                pep_count = read_number_from_file("info/counts/peps/{}_pep_count.txt".format(thiss))
                txome_N50 = read_number_from_file("info/counts/fasta/{}_N50.txt".format(thiss))
                pep_N50 = read_number_from_file("info/counts/peps/{}_N50.txt".format(thiss))
                qual_reject = read_number_from_file("info/trimming/txome/{}_qual_rejects.txt".format(thiss))
                adap_reject = read_number_from_file("info/trimming/txome/{}_adapter_rejects.txt".format(thiss))
                gc_reject = read_number_from_file("info/trimming/txome/{}_gc_rejects.txt".format(thiss))
                tx_meanlen = read_number_from_file("info/counts/fasta/{}_meanlen.txt".format(thiss))
                txome_N90 = read_number_from_file("info/counts/fasta/{}_N90.txt".format(thiss))
                txome_MB = read_number_from_file("info/counts/fasta/{}_mb.txt".format(thiss))
                row = [thiss,
                       int(trim_num)/int(raw_num),  # fraction of reads surviving trimming
                       raw_num,
                       trim_num,
                       qual_reject,
                       adap_reject,
                       gc_reject,
                       txome_count,
                       tx_meanlen,
                       txome_N50,
                       txome_N90,
                       txome_MB,
                       pep_count,
                       pep_N50]
                print(", ".join(str(field) for field in row), file = finalout_handle)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment