Last active
October 11, 2016 20:53
-
-
Save sxv/6cc5142b8e0ac7502a8acff30f3f7240 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# define input cases via master_key.txt | |
# for each run, align fastq -> bam | |
# if GATK specified, run GATK, variant calling, and annovar | |
# run matricizer2 on tab output | |
# todo: add R clustering scripts | |
## usage | |
# python rnarunner.py --input-dir /path/to/fastqs | |
## NOTES: input-dir must contain master_key.txt, input fastq.gz files, matricizer2.py, UCSC_GENE_NAME3.txt | |
import time, os, argparse | |
from glob import glob | |
now = int(time.time()) | |
parser = argparse.ArgumentParser(description='RNA sample pipeline runner') | |
parser.add_argument('--input-dir', default='./', required=False) | |
parser.add_argument('--master-key', default='master_key.txt', required=False) | |
parser.add_argument('--batch-name', required=False) | |
parser.add_argument('--threads', default='8', required=False) | |
parser.add_argument('--star', default='/media/lab/data256gb/STAR-STAR_2.4.2a/bin/Linux_x86_64', required=False) | |
parser.add_argument('--star-ref', default='/media/lab/data3tb_1/STAR_ref/', required=False) | |
parser.add_argument('--genome', default='hg19/GRCh37.primary_assembly.reordered.fa', required=False) | |
parser.add_argument('--picard', default='/media/lab/data512gb/bcbio/share/java/picard-1.96', required=False) | |
parser.add_argument('--gatk', default='/media/lab/data256gb/bcbio/GenomeAnalysisTK.jar', required=False) | |
parser.add_argument('--varscan', default='/media/lab/data256gb/bcbio/share/java/varscan-2.3.7/', required=False) | |
parser.add_argument('--annovar', default='/media/lab/data3tb_2/annovar/', required=False) | |
parser.add_argument('--star-genome', default='/media/lab/data256gb/STAR-STAR_2.4.2a/star_genome', required=False) | |
#parser.add_argument('--matrix-config', default='False', required=False) | |
args = vars(parser.parse_args()) | |
Mills = 'Mills_and_1000G_gold_standard.indels.b37.sites.gencode.vcf' | |
upload_dir = '/mmedia/stroma-common/rna.seq/complete/' | |
def main(): | |
cases = queue_cases() | |
for case_name in cases: | |
if case_name in ['skipme']: | |
print 'Skipping already completed case: %s' % case_name | |
continue | |
# make case dir, | |
case_dir = "%s/%s" % (args['input_dir'], case_name) | |
os.system("bash -c 'mkdir %s'" % case_dir) | |
bam_path = "%s/%s.bam" % (case_dir, case_name) | |
batch_name = "%s.%s" % (args['batch_name'], now) | |
samples = CASE_DICT[case_name] | |
if not glob(bam_path): | |
run_star(case_dir, samples, bam_path) | |
if not glob('%s/*ReadsPerGene.txt' % case_dir): | |
run_htseq_count(bam_path, case_name) | |
#vcf_path = "%s/%s.vcf" % (case_dir, case_name) | |
#if not glob(vcf_path): | |
# run_gatk(case_dir, case_name, bam_path, vcf_path) | |
#if glob(vcf_path): | |
# os.system("bash -c 'rm %s/out*ba?'" % case_dir) | |
# os.system("bash -c 'mv %s %s'" % (case_dir, upload_dir)) | |
#annotate(batch_name) #vcfgenie | |
#matricizer(batch_name) ON READSPERGENE Files in each case_dir | |
#matricizer2.py needs to be changed to matricize raw read counts (d/t new format of htseq vs star) | |
def find_read_pair(path1): | |
if os.path.isfile(path1): | |
if 'pf.fastq' in path1: | |
path2 = path1.replace('1_pf.fastq', '2_pf.fastq') | |
else: | |
path2 = path1.replace('1.fastq', '2.fastq') | |
if os.path.isfile(path2): | |
print "FOUND pair for %s: %s" % (path1, path2) | |
return path2 | |
else: print "No pair found for %s" % path1 | |
else: print "%s not a valid file" % path1 | |
return False | |
CASE_DICT = {} | |
MASTER_KEY = "%s/%s" % (args['input_dir'], args['master_key']) | |
def queue_cases(): | |
cases = [] | |
for line in open(MASTER_KEY): | |
if not line.startswith('#'): | |
if line.split(): | |
fastq, case_name = line.split() | |
print 'fastq: %s ; case_name: %s' % (fastq, case_name) | |
cases.append(case_name) | |
fastq2 = False | |
if '1.fastq' in fastq or '1_pf.fastq' in fastq: | |
fastq2 = find_read_pair(fastq) | |
else: print '1.fastq not found' | |
if fastq2: CASE_DICT[case_name] = "%s %s" % (fastq, fastq2) | |
else: CASE_DICT[case_name] = fastq | |
print "%s cases queued: \n%s" % (len(cases), CASE_DICT) | |
return cases | |
def run_star(case_dir, samples, bam_path): | |
star_command = "cd %s && %s/STAR --genomeDir %s --readFilesIn %s --runThreadN %s --quantMode TranscriptomeSAM GeneCounts --outReadsUnmapped None --outFilterMultimapNmax 1 --outFilterScoreMin 20 --outSAMtype BAM SortedByCoordinate --sjdbGTFfile %s/hg19/gencode.v24lift37.annotation.gtf --readFilesCommand zcat" % (case_dir, args['star'], args['star_genome'], samples, args['threads'], args['star_ref']) | |
print star_command | |
os.system("bash -c '%s'" % star_command) | |
os.system("bash -c 'mv %s/Aligned.sortedByCoord.out.bam %s'" % (case_dir, bam_path)) | |
def run_htseq_count(bam, sample): | |
#os.system("bash -c 'htseq-count -r pos -f bam %s /media/lab/data3tb_1/STAR_ref/gencode.v24lift37.annotation.gff3 > %s/%s.ReadsPerGene.txt'" % (bam, sample, sample)) | |
os.system("bash -c 'htseq-count -f bam %s /media/lab/data3tb_1/STAR_ref/gencode.v24lift37.annotation.gff3 > %s/%s.ReadsPerGene.txt'" % (bam, sample, sample)) | |
def matricizer(batch_name): | |
os.system("bash -c 'python ./matricizer2.py > %s/%s.matrix.txt'" % (args['input_dir'], batch_name)) | |
def annotate(batch_name): | |
os.system("""bash -c 'ssh [email protected] "mkdir -p /media/joannaprzybyl/SSD-Data/vcfgenie/input/%s"'""" % batch_name) | |
os.system("bash -c 'scp %s/*/*.vcf [email protected]:/media/joannaprzybyl/SSD-Data/vcfgenie/input/%s/'" % (args['input_dir'], batch_name)) | |
def run_gatk(case_dir, case_name, bam, vcf_path): | |
if not glob('%s/out.bam' % case_dir): | |
os.system("bash -c 'java -jar %s/AddOrReplaceReadGroups.jar I=%s O=%s/out.bam SO=coordinate RGID=test RGLB=test RGPL=Illumina RGPU=machine RGSM=test/'" % (args['picard'], bam, case_dir)) | |
if not glob('%s/out2.bam' % case_dir): | |
os.system("bash -c 'java -jar %s/MarkDuplicates.jar I=%s/out.bam O=%s/out2.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics AS=TRUE REMOVE_DUPLICATES=TRUE'" % (args['picard'], case_dir, case_dir)) | |
if not glob('%s/out2a*bai' % case_dir): | |
os.system("bash -c 'java -jar /media/lab/data512gb/bcbio/share/java/picard-1.96/ReorderSam.jar I=%s/out2.bam O=%s/out2a.bam REFERENCE=%s/%s'" % (case_dir, case_dir, args['star_ref'], args['genome'])) # ~15minutes | |
os.system("bash -c 'samtools index %s/out2a.bam'" % case_dir) | |
#*** change -R path | |
if not glob('%s/out3.bam' % case_dir): | |
os.system("bash -c 'java -jar %s -T SplitNCigarReads -R %s/%s -I %s/out2a.bam -o %s/out3.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS'" % (args['gatk'], args['star_ref'], args['genome'], case_dir, case_dir)) | |
#*** -R path and -known path #GATK RealignerTargetCreator #add Mills Gold Standard vcf | |
if not glob('%s/forIndelRealigner.intervals' % case_dir): | |
os.system("bash -c 'java -Xmx2g -jar %s -T RealignerTargetCreator -R %s/%s -nt %s -I %s/out3.bam -o %s/forIndelRealigner.intervals -known %s/%s'" % (args['gatk'], args['star_ref'], args['genome'], args['threads'], case_dir, case_dir, args['star_ref'], Mills)) | |
#*** -R path and -known path #GATK IndelRealigner #add Mills Gold Standard vcf | |
if not glob('%s/out4.bam' % case_dir): | |
os.system("bash -c 'java -Xmx4g -jar %s -T IndelRealigner -R %s/%s -I %s/out3.bam -targetIntervals %s/forIndelRealigner.intervals -o %s/out4.bam -known %s/%s'" % (args['gatk'], args['star_ref'], args['genome'], case_dir, case_dir, case_dir, args['star_ref'], Mills)) | |
#*** -R path and -known path #GATK base recalibration #recommend latest version dbsnp...146.dbsnp.vcf | |
if not glob('%s/recalibration_report.grp' % case_dir): | |
os.system("bash -c 'java -Xmx4g -jar %s -T BaseRecalibrator -nct %s -I %s/out4.bam -R %s/%s -knownSites %s/%s -o %s/recalibration_report.grp'" % (args['gatk'], args['threads'], case_dir, args['star_ref'], args['genome'], args['star_ref'], Mills, case_dir)) | |
#*** -R path | |
if not glob('%s/*final.bam' % case_dir): | |
os.system("bash -c 'java -jar %s -T PrintReads -R %s/%s -nct %s -I %s/out4.bam -BQSR %s/recalibration_report.grp -o %s/%s.final.bam'" % (args['gatk'], args['star_ref'], args['genome'], args['threads'], case_dir, case_dir, case_dir, case_name)) | |
#1 core | |
#nohup bash $SCRIPTS/mpileup_varscan.sh $1 > ./nohup_mpileup_varscan.txt & | |
#Haplotype caller (RNA-seq mode) #*** -R path | |
if not glob(vcf_path): | |
os.system("bash -c 'java -jar %s -T HaplotypeCaller -R %s/%s -I %s/%s.final.bam -dontUseSoftClippedBases -nct %s -stand_call_conf 20.0 -stand_emit_conf 20.0 -o %s'" % (args['gatk'], args['star_ref'], args['genome'], case_dir, case_name, args['threads'], vcf_path)) | |
if __name__ == "__main__": main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment