Last active
August 22, 2019 16:42
-
-
Save ccwang002/2659b19439b6205284c0ae68ca06345d to your computer and use it in GitHub Desktop.
Example Snakefile for the new Tuxedo RNA-Seq pipeline (local)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The Snakefile that loads raw data and genome reference locally | |
GENOME_FA = "griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/chr22_ERCC92.fa" | |
GENOME_GTF = "griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/genes_chr22_ERCC92.gtf" | |
HISAT2_INDEX_PREFIX = "hisat2_index/chr22_ERCC92" | |
SAMPLES, *_ = glob_wildcards('griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz') | |
from pathlib import Path | |
rule extract_genome_splice_sites: | |
input: GENOME_GTF | |
output: "hisat2_index/chr22_ERCC92.ss" | |
shell: "hisat2_extract_splice_sites.py {input} > {output}" | |
rule extract_genome_exons: | |
input: GENOME_GTF | |
output: "hisat2_index/chr22_ERCC92.exon" | |
shell: "hisat2_extract_exons.py {input} > {output}" | |
rule build_hisat_index: | |
input: | |
genome_fa=GENOME_FA, | |
splice_sites="hisat2_index/chr22_ERCC92.ss", | |
exons="hisat2_index/chr22_ERCC92.exon", | |
output: expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9)) | |
log: "hisat2_index/build.log" | |
threads: 8 | |
shell: | |
"hisat2-build -p {threads} {input.genome_fa} " | |
"--ss {input.splice_sites} --exon {input.exons} {HISAT2_INDEX_PREFIX} " | |
"2>{log}" | |
rule align_hisat: | |
input: | |
hisat2_index=expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9)), | |
fastq1="griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz", | |
fastq2="griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read2.fastq.gz", | |
output: "align_hisat2/{sample}.bam" | |
log: "align_hisat2/{sample}.log" | |
threads: 4 | |
shell: | |
"hisat2 -p {threads} --dta -x {HISAT2_INDEX_PREFIX} " | |
"-1 {input.fastq1} -2 {input.fastq2} 2>{log} | " | |
"samtools sort -@ {threads} -o {output}" | |
rule align_all_samples: | |
input: expand("align_hisat2/{sample}.bam", sample=SAMPLES) | |
rule stringtie_assemble: | |
input: | |
genome_gtf=GENOME_GTF, | |
bam="align_hisat2/{sample}.bam" | |
output: "stringtie/assembled/{sample}.gtf" | |
threads: 4 | |
shell: | |
"stringtie -p {threads} -G {input.genome_gtf} " | |
"-o {output} -l {wildcards.sample} {input.bam}" | |
rule stringtie_merge_list: | |
input: expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES) | |
output: "stringtie/merged_list.txt" | |
run: | |
with open(output[0], 'w') as f: | |
for gtf in input: | |
print(Path(gtf).resolve(), file=f) | |
rule stringtie_merge: | |
input: | |
genome_gtf=GENOME_GTF, | |
merged_list="stringtie/merged_list.txt", | |
sample_gtfs=expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES) | |
output: "stringtie/merged.gtf" | |
threads: 4 | |
shell: | |
"stringtie --merge -p {threads} -G {input.genome_gtf} " | |
"-o {output} {input.merged_list}" | |
rule stringtie_quant: | |
input: | |
merged_gtf="stringtie/merged.gtf", | |
sample_bam="align_hisat2/{sample}.bam" | |
output: | |
gtf="stringtie/quant/{sample}/{sample}.gtf", | |
gene_abund_tab="stringtie/quant/{sample}/gene_abund.tab", | |
ctabs=expand( | |
"stringtie/quant/{{sample}}/{name}.ctab", | |
name=['i2t', 'e2t', 'i_data', 'e_data', 't_data'] | |
) | |
threads: 4 | |
shell: | |
"stringtie -e -B -p {threads} -G {input.merged_gtf} " | |
"-A {output.gene_abund_tab} -o {output.gtf} {input.sample_bam}" | |
rule quant_all_samples: | |
input: expand("stringtie/quant/{sample}/{sample}.gtf", sample=SAMPLES) | |
rule clean: | |
shell: | |
"rm -rf align_hisat2 hisat2_index stringtie" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment