Skip to content

Instantly share code, notes, and snippets.

@moonwatcher
Last active August 29, 2015 14:04
Show Gist options
  • Save moonwatcher/c851834ed6a2d4a3b726 to your computer and use it in GitHub Desktop.
Save moonwatcher/c851834ed6a2d4a3b726 to your computer and use it in GitHub Desktop.
Manual demultiplexing for gencore
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
import os
import re
import json
from subprocess import Popen, PIPE
from argparse import ArgumentParser
MEMORY_REQUIREMENT = '64GB'
BCL2FASTQ_FOLDER = 'bcl2fastq'
QC_FOLDER = 'qc'
DEFAULT_CORES = 20
PERL_PATH='/opt/perlbrew/perls/perl-5.14.4/bin/perl'
FASTQC_PATH='/opt/FastQC/fastqc'
CONFIGURE_BCLTOFASTQ_PATH = '/usr/local/bin/configureBclToFastq.pl'
#CONFIGURE_BCLTOFASTQ_PATH = '/share/apps/bcl2fastq/1.8.4/gnu/bin/configureBclToFastq.pl'
FASTQ_PATTERN = re.compile(r'(?P<name>.*)_(?P<barcode>[ATCG-]+|Undetermined)_L00(?P<lane>[1-8])_R(?P<read>[1-4])_(?P<frament>[0-9]{3})\.fastq\.gz')
def load_config(env):
config = None
if env['conf']:
path = os.path.realpath(os.path.expanduser(os.path.expandvars(env['conf'])))
if os.path.exists(path):
f = open(path, 'rb')
config = json.load(f)
f.close()
config['config path'] = path
config['base path'] = u'{}/{}'.format(config['output path'], config['name'])
config['bcl2fastq command'] = CONFIGURE_BCLTOFASTQ_PATH
config['sample sheet head'] = 'FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject'
config['bcl2fastq path'] = u'{}/{}'.format(config['base path'], BCL2FASTQ_FOLDER)
config['quality control path'] = u'{}/{}'.format(config['base path'], config['name'])
config['library report path'] = u'{}/{}'.format(config['quality control path'], 'library.csv')
config['lane report path'] = u'{}/{}'.format(config['quality control path'], 'lane.csv')
config['pbs path'] = u'{}/{}'.format(config['base path'], 'demultiplex.pbs')
config['sample sheet path'] = u'{}/{}'.format(config['base path'], 'SampleSheet.csv')
if 'cores' not in config: config['cores'] = DEFAULT_CORES
return config
def save_config(env, config):
if env['conf']:
path = os.path.realpath(os.path.expanduser(os.path.expandvars(env['conf'])))
f = open(path, 'w')
f.write(json.dumps(config, sort_keys=True, indent=4))
f.close()
def make_pbs(config):
bcl2fastq_command = [
PERL_PATH,
config['bcl2fastq command'],
'--input-dir {}/Data/Intensities/BaseCalls'.format(config['flowcell path']),
'--output-dir {}/Unaligned'.format(config['bcl2fastq path']),
'--sample-sheet {}'.format(config['sample sheet path']),
'--fastq-cluster-count 0',
'--no-eamss',
'--mismatches 1',
]
if config['broken']:
bcl2fastq_command.extend([
'--ignore-missing-stats',
'--ignore-missing-bcl',
'--ignore-missing-control',
])
# add parameters from config
bcl2fastq_command.append(config['configure bcl2fastq'])
content = [
'#!/bin/sh',
'#PBS -V',
'#PBS -l nodes=1:ppn={},walltime=48:00:00'.format(config['cores']),
'#PBS -M {}'.format(config['email']),
'#PBS -m abe',
'#PBS -q s48',
'#PBS -l mem={}'.format(MEMORY_REQUIREMENT),
'#PBS -e {}/log.stderr'.format(config['base path']),
'#PBS -o {}/log.stdout'.format(config['base path']),
'',
'[ -n "$(export|grep MODULEPATH)" ] && (module purge; module load bcl2fastq/gnu/1.8.4; module load fastqc/0.10.1)',
'',
'{}\n'.format(' '.join(bcl2fastq_command)),
'cd {}/Unaligned'.format(config['bcl2fastq path']),
'make -j{}; result=$?\n'.format(config['cores']),
'for x in {}/Unaligned/*/*/*.fastq.gz; do {} --noextract --threads {} --outdir {} $x; done\n'.format(
config['bcl2fastq path'],
FASTQC_PATH,
config['cores'],
config['quality control path']),
'cd {} && for x in *.zip; do unzip -qx $x && rm $x; done\n'.format(config['quality control path']),
'rsync --recursive {}/Unaligned/Basecall_Stats_* {}/'.format(config['bcl2fastq path'], config['quality control path']),
'demux --conf {} report\n'.format(config['config path']),
'cd {} && tar -cjf {}.tar.bz2 {}\n'.format(config['base path'], config['name'], config['name']),
'exit $result\n',
]
f = open(config['pbs path'], 'w')
f.write(u'\n'.join(content))
f.close()
def make_sample_sheet(config):
f = open(config['sample sheet path'], 'w')
f.write('{}\n'.format(config['sample sheet head']))
f.write(u'\n'.join(config['sample sheet']))
f.close()
def make_output_directories(config):
os.makedirs(config['bcl2fastq path'])
os.makedirs(config['quality control path'])
def list_fastq_files(config):
proc = Popen('ls {}/Unaligned/*/*/*.fastq.gz'.format(config['bcl2fastq path']), shell=True, stdout=PIPE, stderr=PIPE)
result = proc.communicate()
fastqs = result[0].splitlines()
return fastqs
def inspect_fastq_file(path):
node = None
match = FASTQ_PATTERN.search(os.path.basename(path))
if match:
node = match.groupdict()
node['path'] = path
proc = Popen('zcat {} | wc -l'.format(path), shell=True, stdout=PIPE, stderr=PIPE)
result = proc.communicate()
node['count'] = int(result[0]) / 4
return node
def print_library_report(config, report):
f = open(config['library report path'], 'w')
print(u'{},{},{},{},{}'.format('name', 'lane', 'read', 'count', 'portion'), file=f)
for lane in report['lane'].values():
for fastq in lane['library']:
print(u'{},{},{},{},{}'.format(fastq['name'], fastq['lane'], fastq['read'], fastq['count'], fastq['portion']), file=f)
f.close()
def print_lane_report(config, report):
f = open(config['lane report path'], 'w')
print (u'{},{},{},{},{},{}'.format('lane', 'count', 'undetermined count', 'undetermined portion', 'demultiplexed count', 'demultiplexed portion'), file=f)
for lane in report['lane'].values():
print (u'{},{},{},{},{},{}'.format(lane['number'], lane['count'], lane['undetermined count'], lane['undetermined portion'], lane['demultiplexed count'], lane['demultiplexed portion']), file=f)
f.close()
def compile_report(config):
report = {
'fastq':[],
'lane':{},
'count':0,
'undetermined count':0,
'undetermined portion':0.0,
'demultiplexed count':0,
'demultiplexed portion':0.0,
}
paths = list_fastq_files(config)
for path in paths:
node = inspect_fastq_file(path)
if node:
report['fastq'].append(node)
for fastq in report['fastq']:
if fastq['lane'] not in report['lane']:
report['lane'][fastq['lane']] = {
'number':fastq['lane'],
'library':[],
'count':0,
'undetermined count':0,
'undetermined portion':0.0,
'demultiplexed count':0,
'demultiplexed portion':0.0,
}
report['lane'][fastq['lane']]['library'].append(fastq)
if fastq['read'] == '1':
report['lane'][fastq['lane']]['count'] += fastq['count']
for lane in report['lane'].values():
for fastq in lane['library']:
fastq['portion'] = float(fastq['count']) / float(lane['count'])
if fastq['barcode'] == 'Undetermined' and fastq['read'] == '1':
lane['undetermined count'] = fastq['count']
lane['undetermined portion'] = fastq['portion']
lane['demultiplexed portion'] = 1.0 - fastq['portion']
for lane in report['lane'].values():
report['count'] += lane['count']
report['undetermined count'] += lane['undetermined count']
report['demultiplexed count'] += lane['demultiplexed count']
report['undetermined portion'] = float(report['undetermined count']) / float(report['count'])
report['demultiplexed portion'] = float(report['demultiplexed count']) / float(report['count'])
del report['fastq']
config['report'] = report
def print_report(config):
if 'report' not in config:
compile_report(config)
print_lane_report(config, config['report'])
print_library_report(config, config['report'])
def submit_pbs(config):
proc = Popen('qsub -N {}-Demux {}'.format(config['name'], config['pbs path']), shell=True, stdout=PIPE, stderr=PIPE)
proc.communicate()
def run(config):
proc = Popen('cat {}|bash'.format(config['pbs path']), shell=True)
proc.communicate()
def decode_cli():
env = {}
p = ArgumentParser()
p.add_argument('-c', '--conf', required=True, metavar='PATH', dest='conf', help='Path to configuration file')
# -- sub parsers for each action --
s = p.add_subparsers(dest='action')
c = s.add_parser( 'prepare', help='prepare demux',
description='prepare for demultiplexing flowcell'
)
c.add_argument('-b', '--broken', dest='broken', action='store_true', default=False,
help='handle broken runs with missing or corrupted files. \
This will add the ignore-missing-stats, ignore-missing-bcl and ignore-missing-control flags to bcl2fastq.')
c = s.add_parser( 'submit', help='submit pbs job',
description='submit a pbs job for demultiplexing the flowcell'
)
c = s.add_parser( 'run', help='execute the pbs script locally',
description='execut the pbs script in a child shell on the executing machine.'
)
c = s.add_parser( 'report', help='compile a demux report',
description='calculate read counts and distribution of libraries in lanes'
)
for k,v in vars(p.parse_args()).iteritems():
if v is not None: env[k] = v
return env
def main():
env = decode_cli()
config = load_config(env)
if env['action'] == 'prepare':
config['broken'] = env['broken']
make_output_directories(config)
make_sample_sheet(config)
make_pbs(config)
if env['action'] == 'submit':
submit_pbs(config)
if env['action'] == 'run':
run(config)
if env['action'] == 'report':
print_report(config)
save_config(env, config)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment