ccwang002 · November 17, 2017 19:36
diff --git a/Snakefile b/Snakefile
 from pathlib import Path
 from snakemake.remote.GS import RemoteProvider as GSRemoteProvider
 GS = GSRemoteProvider()


 GS_PREFIX = "lbwang-playground/snakemake_rnaseq"
 GENOME_FA =  GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/chr22_ERCC92.fa")
 GENOME_GTF = GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/genes_chr22_ERCC92.gtf")
 HISAT2_INDEX_PREFIX = "hisat2_index/chr22_ERCC92"
 FULL_HISAT2_INDEX_PREFIX = "dinglab/lbwang/snakemake_demo/hisat2_index/chr22_ERCC92"

 # # Use GS.glob_wildcards only after the following issues is resolved
 # #     https://bitbucket.org/snakemake/snakemake/issues/700
 # SAMPLES, *_ = GS.glob_wildcards(GS_PREFIX + '/griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz')
 SAMPLES = [
    'HBR_Rep1_ERCC-Mix2_Build37-ErccTranscripts-chr22',
    'HBR_Rep2_ERCC-Mix2_Build37-ErccTranscripts-chr22',
    'HBR_Rep3_ERCC-Mix2_Build37-ErccTranscripts-chr22',
    'UHR_Rep1_ERCC-Mix1_Build37-ErccTranscripts-chr22',
    'UHR_Rep2_ERCC-Mix1_Build37-ErccTranscripts-chr22',
    'UHR_Rep3_ERCC-Mix1_Build37-ErccTranscripts-chr22',
 ]


 rule extract_genome_splice_sites:
    input: GENOME_GTF
    output: "hisat2_index/chr22_ERCC92.ss"
    shell: "hisat2_extract_splice_sites.py {input} > {output}"


 rule extract_genome_exons:
    input: GENOME_GTF
    output: "hisat2_index/chr22_ERCC92.exon"
    shell: "hisat2_extract_exons.py {input} > {output}"


 rule build_hisat_index:
    input:
        genome_fa=GENOME_FA,
        splice_sites="hisat2_index/chr22_ERCC92.ss",
        exons="hisat2_index/chr22_ERCC92.exon",
    output: expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9))
    log: "hisat2_index/build.log"
    threads: 8
    shell:
        "hisat2-build -p {threads} {input.genome_fa} "
        "--ss {input.splice_sites} --exon {input.exons} {FULL_HISAT2_INDEX_PREFIX} "
        "2>{log}"


 rule align_hisat:
    input:
        hisat2_index=expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9)),
        fastq1=GS.remote(GS_PREFIX + "/griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz"),
        fastq2=GS.remote(GS_PREFIX + "/griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read2.fastq.gz"),
    output: "align_hisat2/{sample}.bam"
    log: "align_hisat2/{sample}.log"
    threads: 4
    shell:
        "hisat2 -p {threads} --dta -x {FULL_HISAT2_INDEX_PREFIX} "
        "-1 {input.fastq1} -2 {input.fastq2} 2>{log} | "
        "samtools sort -@ {threads} -o {output}"


 rule align_all_samples:
    input: expand("align_hisat2/{sample}.bam", sample=SAMPLES)


 rule stringtie_assemble:
    input:
        genome_gtf=GENOME_GTF,
        bam="align_hisat2/{sample}.bam"
    output: "stringtie/assembled/{sample}.gtf"
    threads: 4
    shell:
        "stringtie -p {threads} -G {input.genome_gtf} "
        "-o {output} -l {wildcards.sample} {input.bam}"


 rule stringtie_merge_list:
    input: expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES)
    output: "stringtie/merged_list.txt"
    run:
        with open(output[0], 'w') as f:
            for gtf in input:
                print(Path(gtf).resolve(), file=f)


 rule stringtie_merge:
    input:
        genome_gtf=GENOME_GTF,
        merged_list="stringtie/merged_list.txt",
        sample_gtfs=expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES)
    output: "stringtie/merged.gtf"
    threads: 4
    shell:
        "stringtie --merge -p {threads} -G {input.genome_gtf} "
        "-o {output} {input.merged_list}"


 rule stringtie_quant:
    input:
        merged_gtf="stringtie/merged.gtf",
        sample_bam="align_hisat2/{sample}.bam"
    output:
        gtf="stringtie/quant/{sample}/{sample}.gtf",
        ctabs=expand(
            "stringtie/quant/{{sample}}/{name}.ctab",
            name=['i2t', 'e2t', 'i_data', 'e_data', 't_data']
        )
    threads: 4
    shell:
        "stringtie -e -B -p {threads} -G {input.merged_gtf} "
        "-o {output.gtf} {input.sample_bam}"


 rule quant_all_samples:
    input: expand("stringtie/quant/{sample}/{sample}.gtf", sample=SAMPLES)


 rule clean:
    shell:
        "rm -rf align_hisat2 hisat2_index stringtie"
	from pathlib import Path
	from snakemake.remote.GS import RemoteProvider as GSRemoteProvider
	GS = GSRemoteProvider()


	GS_PREFIX = "lbwang-playground/snakemake_rnaseq"
	GENOME_FA = GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/chr22_ERCC92.fa")
	GENOME_GTF = GS.remote(f"{GS_PREFIX}/griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/genes_chr22_ERCC92.gtf")
	HISAT2_INDEX_PREFIX = "hisat2_index/chr22_ERCC92"
	FULL_HISAT2_INDEX_PREFIX = "dinglab/lbwang/snakemake_demo/hisat2_index/chr22_ERCC92"

	# # Use GS.glob_wildcards only after the following issues is resolved
	# # https://bitbucket.org/snakemake/snakemake/issues/700
	# SAMPLES, *_ = GS.glob_wildcards(GS_PREFIX + '/griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz')
	SAMPLES = [
	'HBR_Rep1_ERCC-Mix2_Build37-ErccTranscripts-chr22',
	'HBR_Rep2_ERCC-Mix2_Build37-ErccTranscripts-chr22',
	'HBR_Rep3_ERCC-Mix2_Build37-ErccTranscripts-chr22',
	'UHR_Rep1_ERCC-Mix1_Build37-ErccTranscripts-chr22',
	'UHR_Rep2_ERCC-Mix1_Build37-ErccTranscripts-chr22',
	'UHR_Rep3_ERCC-Mix1_Build37-ErccTranscripts-chr22',
	]


	rule extract_genome_splice_sites:
	input: GENOME_GTF
	output: "hisat2_index/chr22_ERCC92.ss"
	shell: "hisat2_extract_splice_sites.py {input} > {output}"


	rule extract_genome_exons:
	input: GENOME_GTF
	output: "hisat2_index/chr22_ERCC92.exon"
	shell: "hisat2_extract_exons.py {input} > {output}"


	rule build_hisat_index:
	input:
	genome_fa=GENOME_FA,
	splice_sites="hisat2_index/chr22_ERCC92.ss",
	exons="hisat2_index/chr22_ERCC92.exon",
	output: expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9))
	log: "hisat2_index/build.log"
	threads: 8
	shell:
	"hisat2-build -p {threads} {input.genome_fa} "
	"--ss {input.splice_sites} --exon {input.exons} {FULL_HISAT2_INDEX_PREFIX} "
	"2>{log}"


	rule align_hisat:
	input:
	hisat2_index=expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9)),
	fastq1=GS.remote(GS_PREFIX + "/griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz"),
	fastq2=GS.remote(GS_PREFIX + "/griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read2.fastq.gz"),
	output: "align_hisat2/{sample}.bam"
	log: "align_hisat2/{sample}.log"
	threads: 4
	shell:
	"hisat2 -p {threads} --dta -x {FULL_HISAT2_INDEX_PREFIX} "
	"-1 {input.fastq1} -2 {input.fastq2} 2>{log} \| "
	"samtools sort -@ {threads} -o {output}"


	rule align_all_samples:
	input: expand("align_hisat2/{sample}.bam", sample=SAMPLES)


	rule stringtie_assemble:
	input:
	genome_gtf=GENOME_GTF,
	bam="align_hisat2/{sample}.bam"
	output: "stringtie/assembled/{sample}.gtf"
	threads: 4
	shell:
	"stringtie -p {threads} -G {input.genome_gtf} "
	"-o {output} -l {wildcards.sample} {input.bam}"


	rule stringtie_merge_list:
	input: expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES)
	output: "stringtie/merged_list.txt"
	run:
	with open(output[0], 'w') as f:
	for gtf in input:
	print(Path(gtf).resolve(), file=f)


	rule stringtie_merge:
	input:
	genome_gtf=GENOME_GTF,
	merged_list="stringtie/merged_list.txt",
	sample_gtfs=expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES)
	output: "stringtie/merged.gtf"
	threads: 4
	shell:
	"stringtie --merge -p {threads} -G {input.genome_gtf} "
	"-o {output} {input.merged_list}"


	rule stringtie_quant:
	input:
	merged_gtf="stringtie/merged.gtf",
	sample_bam="align_hisat2/{sample}.bam"
	output:
	gtf="stringtie/quant/{sample}/{sample}.gtf",
	ctabs=expand(
	"stringtie/quant/{{sample}}/{name}.ctab",
	name=['i2t', 'e2t', 'i_data', 'e_data', 't_data']
	)
	threads: 4
	shell:
	"stringtie -e -B -p {threads} -G {input.merged_gtf} "
	"-o {output.gtf} {input.sample_bam}"


	rule quant_all_samples:
	input: expand("stringtie/quant/{sample}/{sample}.gtf", sample=SAMPLES)


	rule clean:
	shell:
	"rm -rf align_hisat2 hisat2_index stringtie"