Last active
August 29, 2015 14:04
-
-
Save moonwatcher/c851834ed6a2d4a3b726 to your computer and use it in GitHub Desktop.
Manual demultiplexing for gencore
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from __future__ import print_function | |
import sys | |
import os | |
import re | |
import json | |
from subprocess import Popen, PIPE | |
from argparse import ArgumentParser | |
MEMORY_REQUIREMENT = '64GB' | |
BCL2FASTQ_FOLDER = 'bcl2fastq' | |
QC_FOLDER = 'qc' | |
DEFAULT_CORES = 20 | |
PERL_PATH='/opt/perlbrew/perls/perl-5.14.4/bin/perl' | |
FASTQC_PATH='/opt/FastQC/fastqc' | |
CONFIGURE_BCLTOFASTQ_PATH = '/usr/local/bin/configureBclToFastq.pl' | |
#CONFIGURE_BCLTOFASTQ_PATH = '/share/apps/bcl2fastq/1.8.4/gnu/bin/configureBclToFastq.pl' | |
FASTQ_PATTERN = re.compile(r'(?P<name>.*)_(?P<barcode>[ATCG-]+|Undetermined)_L00(?P<lane>[1-8])_R(?P<read>[1-4])_(?P<frament>[0-9]{3})\.fastq\.gz') | |
def load_config(env): | |
config = None | |
if env['conf']: | |
path = os.path.realpath(os.path.expanduser(os.path.expandvars(env['conf']))) | |
if os.path.exists(path): | |
f = open(path, 'rb') | |
config = json.load(f) | |
f.close() | |
config['config path'] = path | |
config['base path'] = u'{}/{}'.format(config['output path'], config['name']) | |
config['bcl2fastq command'] = CONFIGURE_BCLTOFASTQ_PATH | |
config['sample sheet head'] = 'FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject' | |
config['bcl2fastq path'] = u'{}/{}'.format(config['base path'], BCL2FASTQ_FOLDER) | |
config['quality control path'] = u'{}/{}'.format(config['base path'], config['name']) | |
config['library report path'] = u'{}/{}'.format(config['quality control path'], 'library.csv') | |
config['lane report path'] = u'{}/{}'.format(config['quality control path'], 'lane.csv') | |
config['pbs path'] = u'{}/{}'.format(config['base path'], 'demultiplex.pbs') | |
config['sample sheet path'] = u'{}/{}'.format(config['base path'], 'SampleSheet.csv') | |
if 'cores' not in config: config['cores'] = DEFAULT_CORES | |
return config | |
def save_config(env, config): | |
if env['conf']: | |
path = os.path.realpath(os.path.expanduser(os.path.expandvars(env['conf']))) | |
f = open(path, 'w') | |
f.write(json.dumps(config, sort_keys=True, indent=4)) | |
f.close() | |
def make_pbs(config): | |
bcl2fastq_command = [ | |
PERL_PATH, | |
config['bcl2fastq command'], | |
'--input-dir {}/Data/Intensities/BaseCalls'.format(config['flowcell path']), | |
'--output-dir {}/Unaligned'.format(config['bcl2fastq path']), | |
'--sample-sheet {}'.format(config['sample sheet path']), | |
'--fastq-cluster-count 0', | |
'--no-eamss', | |
'--mismatches 1', | |
] | |
if config['broken']: | |
bcl2fastq_command.extend([ | |
'--ignore-missing-stats', | |
'--ignore-missing-bcl', | |
'--ignore-missing-control', | |
]) | |
# add parameters from config | |
bcl2fastq_command.append(config['configure bcl2fastq']) | |
content = [ | |
'#!/bin/sh', | |
'#PBS -V', | |
'#PBS -l nodes=1:ppn={},walltime=48:00:00'.format(config['cores']), | |
'#PBS -M {}'.format(config['email']), | |
'#PBS -m abe', | |
'#PBS -q s48', | |
'#PBS -l mem={}'.format(MEMORY_REQUIREMENT), | |
'#PBS -e {}/log.stderr'.format(config['base path']), | |
'#PBS -o {}/log.stdout'.format(config['base path']), | |
'', | |
'[ -n "$(export|grep MODULEPATH)" ] && (module purge; module load bcl2fastq/gnu/1.8.4; module load fastqc/0.10.1)', | |
'', | |
'{}\n'.format(' '.join(bcl2fastq_command)), | |
'cd {}/Unaligned'.format(config['bcl2fastq path']), | |
'make -j{}; result=$?\n'.format(config['cores']), | |
'for x in {}/Unaligned/*/*/*.fastq.gz; do {} --noextract --threads {} --outdir {} $x; done\n'.format( | |
config['bcl2fastq path'], | |
FASTQC_PATH, | |
config['cores'], | |
config['quality control path']), | |
'cd {} && for x in *.zip; do unzip -qx $x && rm $x; done\n'.format(config['quality control path']), | |
'rsync --recursive {}/Unaligned/Basecall_Stats_* {}/'.format(config['bcl2fastq path'], config['quality control path']), | |
'demux --conf {} report\n'.format(config['config path']), | |
'cd {} && tar -cjf {}.tar.bz2 {}\n'.format(config['base path'], config['name'], config['name']), | |
'exit $result\n', | |
] | |
f = open(config['pbs path'], 'w') | |
f.write(u'\n'.join(content)) | |
f.close() | |
def make_sample_sheet(config): | |
f = open(config['sample sheet path'], 'w') | |
f.write('{}\n'.format(config['sample sheet head'])) | |
f.write(u'\n'.join(config['sample sheet'])) | |
f.close() | |
def make_output_directories(config): | |
os.makedirs(config['bcl2fastq path']) | |
os.makedirs(config['quality control path']) | |
def list_fastq_files(config): | |
proc = Popen('ls {}/Unaligned/*/*/*.fastq.gz'.format(config['bcl2fastq path']), shell=True, stdout=PIPE, stderr=PIPE) | |
result = proc.communicate() | |
fastqs = result[0].splitlines() | |
return fastqs | |
def inspect_fastq_file(path): | |
node = None | |
match = FASTQ_PATTERN.search(os.path.basename(path)) | |
if match: | |
node = match.groupdict() | |
node['path'] = path | |
proc = Popen('zcat {} | wc -l'.format(path), shell=True, stdout=PIPE, stderr=PIPE) | |
result = proc.communicate() | |
node['count'] = int(result[0]) / 4 | |
return node | |
def print_library_report(config, report): | |
f = open(config['library report path'], 'w') | |
print(u'{},{},{},{},{}'.format('name', 'lane', 'read', 'count', 'portion'), file=f) | |
for lane in report['lane'].values(): | |
for fastq in lane['library']: | |
print(u'{},{},{},{},{}'.format(fastq['name'], fastq['lane'], fastq['read'], fastq['count'], fastq['portion']), file=f) | |
f.close() | |
def print_lane_report(config, report): | |
f = open(config['lane report path'], 'w') | |
print (u'{},{},{},{},{},{}'.format('lane', 'count', 'undetermined count', 'undetermined portion', 'demultiplexed count', 'demultiplexed portion'), file=f) | |
for lane in report['lane'].values(): | |
print (u'{},{},{},{},{},{}'.format(lane['number'], lane['count'], lane['undetermined count'], lane['undetermined portion'], lane['demultiplexed count'], lane['demultiplexed portion']), file=f) | |
f.close() | |
def compile_report(config): | |
report = { | |
'fastq':[], | |
'lane':{}, | |
'count':0, | |
'undetermined count':0, | |
'undetermined portion':0.0, | |
'demultiplexed count':0, | |
'demultiplexed portion':0.0, | |
} | |
paths = list_fastq_files(config) | |
for path in paths: | |
node = inspect_fastq_file(path) | |
if node: | |
report['fastq'].append(node) | |
for fastq in report['fastq']: | |
if fastq['lane'] not in report['lane']: | |
report['lane'][fastq['lane']] = { | |
'number':fastq['lane'], | |
'library':[], | |
'count':0, | |
'undetermined count':0, | |
'undetermined portion':0.0, | |
'demultiplexed count':0, | |
'demultiplexed portion':0.0, | |
} | |
report['lane'][fastq['lane']]['library'].append(fastq) | |
if fastq['read'] == '1': | |
report['lane'][fastq['lane']]['count'] += fastq['count'] | |
for lane in report['lane'].values(): | |
for fastq in lane['library']: | |
fastq['portion'] = float(fastq['count']) / float(lane['count']) | |
if fastq['barcode'] == 'Undetermined' and fastq['read'] == '1': | |
lane['undetermined count'] = fastq['count'] | |
lane['undetermined portion'] = fastq['portion'] | |
lane['demultiplexed portion'] = 1.0 - fastq['portion'] | |
for lane in report['lane'].values(): | |
report['count'] += lane['count'] | |
report['undetermined count'] += lane['undetermined count'] | |
report['demultiplexed count'] += lane['demultiplexed count'] | |
report['undetermined portion'] = float(report['undetermined count']) / float(report['count']) | |
report['demultiplexed portion'] = float(report['demultiplexed count']) / float(report['count']) | |
del report['fastq'] | |
config['report'] = report | |
def print_report(config): | |
if 'report' not in config: | |
compile_report(config) | |
print_lane_report(config, config['report']) | |
print_library_report(config, config['report']) | |
def submit_pbs(config): | |
proc = Popen('qsub -N {}-Demux {}'.format(config['name'], config['pbs path']), shell=True, stdout=PIPE, stderr=PIPE) | |
proc.communicate() | |
def run(config): | |
proc = Popen('cat {}|bash'.format(config['pbs path']), shell=True) | |
proc.communicate() | |
def decode_cli(): | |
env = {} | |
p = ArgumentParser() | |
p.add_argument('-c', '--conf', required=True, metavar='PATH', dest='conf', help='Path to configuration file') | |
# -- sub parsers for each action -- | |
s = p.add_subparsers(dest='action') | |
c = s.add_parser( 'prepare', help='prepare demux', | |
description='prepare for demultiplexing flowcell' | |
) | |
c.add_argument('-b', '--broken', dest='broken', action='store_true', default=False, | |
help='handle broken runs with missing or corrupted files. \ | |
This will add the ignore-missing-stats, ignore-missing-bcl and ignore-missing-control flags to bcl2fastq.') | |
c = s.add_parser( 'submit', help='submit pbs job', | |
description='submit a pbs job for demultiplexing the flowcell' | |
) | |
c = s.add_parser( 'run', help='execute the pbs script locally', | |
description='execut the pbs script in a child shell on the executing machine.' | |
) | |
c = s.add_parser( 'report', help='compile a demux report', | |
description='calculate read counts and distribution of libraries in lanes' | |
) | |
for k,v in vars(p.parse_args()).iteritems(): | |
if v is not None: env[k] = v | |
return env | |
def main(): | |
env = decode_cli() | |
config = load_config(env) | |
if env['action'] == 'prepare': | |
config['broken'] = env['broken'] | |
make_output_directories(config) | |
make_sample_sheet(config) | |
make_pbs(config) | |
if env['action'] == 'submit': | |
submit_pbs(config) | |
if env['action'] == 'run': | |
run(config) | |
if env['action'] == 'report': | |
print_report(config) | |
save_config(env, config) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment