Last active
November 17, 2017 19:36
-
-
Save ccwang002/2686840e90574a67a673ec4b48e9f036 to your computer and use it in GitHub Desktop.
Example Snakefile for the new Tuxedo RNA-Seq pipeline (remote on Google Cloud Storage)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
from snakemake.remote.GS import RemoteProvider as GSRemoteProvider | |
GS = GSRemoteProvider() | |
GS_PREFIX = "lbwang-playground/snakemake_rnaseq" | |
GENOME_FA = GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/chr22_ERCC92.fa") | |
GENOME_GTF = GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/genes_chr22_ERCC92.gtf") | |
HISAT2_INDEX_PREFIX = "hisat2_index/chr22_ERCC92" | |
FULL_HISAT2_INDEX_PREFIX = "dinglab/lbwang/snakemake_demo/hisat2_index/chr22_ERCC92" | |
# # Use GS.glob_wildcards only after the following issues is resolved | |
# # https://bitbucket.org/snakemake/snakemake/issues/700 | |
# SAMPLES, *_ = GS.glob_wildcards(GS_PREFIX + '/griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz') | |
SAMPLES = [ | |
'HBR_Rep1_ERCC-Mix2_Build37-ErccTranscripts-chr22', | |
'HBR_Rep2_ERCC-Mix2_Build37-ErccTranscripts-chr22', | |
'HBR_Rep3_ERCC-Mix2_Build37-ErccTranscripts-chr22', | |
'UHR_Rep1_ERCC-Mix1_Build37-ErccTranscripts-chr22', | |
'UHR_Rep2_ERCC-Mix1_Build37-ErccTranscripts-chr22', | |
'UHR_Rep3_ERCC-Mix1_Build37-ErccTranscripts-chr22', | |
] | |
rule extract_genome_splice_sites: | |
input: GENOME_GTF | |
output: "hisat2_index/chr22_ERCC92.ss" | |
shell: "hisat2_extract_splice_sites.py {input} > {output}" | |
rule extract_genome_exons: | |
input: GENOME_GTF | |
output: "hisat2_index/chr22_ERCC92.exon" | |
shell: "hisat2_extract_exons.py {input} > {output}" | |
rule build_hisat_index: | |
input: | |
genome_fa=GENOME_FA, | |
splice_sites="hisat2_index/chr22_ERCC92.ss", | |
exons="hisat2_index/chr22_ERCC92.exon", | |
output: expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9)) | |
log: "hisat2_index/build.log" | |
threads: 8 | |
shell: | |
"hisat2-build -p {threads} {input.genome_fa} " | |
"--ss {input.splice_sites} --exon {input.exons} {FULL_HISAT2_INDEX_PREFIX} " | |
"2>{log}" | |
rule align_hisat: | |
input: | |
hisat2_index=expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9)), | |
fastq1=GS.remote(GS_PREFIX + "/griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz"), | |
fastq2=GS.remote(GS_PREFIX + "/griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read2.fastq.gz"), | |
output: "align_hisat2/{sample}.bam" | |
log: "align_hisat2/{sample}.log" | |
threads: 4 | |
shell: | |
"hisat2 -p {threads} --dta -x {FULL_HISAT2_INDEX_PREFIX} " | |
"-1 {input.fastq1} -2 {input.fastq2} 2>{log} | " | |
"samtools sort -@ {threads} -o {output}" | |
rule align_all_samples: | |
input: expand("align_hisat2/{sample}.bam", sample=SAMPLES) | |
rule stringtie_assemble: | |
input: | |
genome_gtf=GENOME_GTF, | |
bam="align_hisat2/{sample}.bam" | |
output: "stringtie/assembled/{sample}.gtf" | |
threads: 4 | |
shell: | |
"stringtie -p {threads} -G {input.genome_gtf} " | |
"-o {output} -l {wildcards.sample} {input.bam}" | |
rule stringtie_merge_list: | |
input: expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES) | |
output: "stringtie/merged_list.txt" | |
run: | |
with open(output[0], 'w') as f: | |
for gtf in input: | |
print(Path(gtf).resolve(), file=f) | |
rule stringtie_merge: | |
input: | |
genome_gtf=GENOME_GTF, | |
merged_list="stringtie/merged_list.txt", | |
sample_gtfs=expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES) | |
output: "stringtie/merged.gtf" | |
threads: 4 | |
shell: | |
"stringtie --merge -p {threads} -G {input.genome_gtf} " | |
"-o {output} {input.merged_list}" | |
rule stringtie_quant: | |
input: | |
merged_gtf="stringtie/merged.gtf", | |
sample_bam="align_hisat2/{sample}.bam" | |
output: | |
gtf="stringtie/quant/{sample}/{sample}.gtf", | |
ctabs=expand( | |
"stringtie/quant/{{sample}}/{name}.ctab", | |
name=['i2t', 'e2t', 'i_data', 'e_data', 't_data'] | |
) | |
threads: 4 | |
shell: | |
"stringtie -e -B -p {threads} -G {input.merged_gtf} " | |
"-o {output.gtf} {input.sample_bam}" | |
rule quant_all_samples: | |
input: expand("stringtie/quant/{sample}/{sample}.gtf", sample=SAMPLES) | |
rule clean: | |
shell: | |
"rm -rf align_hisat2 hisat2_index stringtie" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment