Skip to content

Instantly share code, notes, and snippets.

@brantfaircloth
Created June 13, 2014 23:31
Show Gist options
  • Save brantfaircloth/d69387a8fe3e8eae9860 to your computer and use it in GitHub Desktop.
Save brantfaircloth/d69387a8fe3e8eae9860 to your computer and use it in GitHub Desktop.
Merge next-seq gzip files
# this is the SampleId column of your SampleSheet.csv
[samples]
sample1-nameA-blah-blah
sample2-nameB-blah-blah
sample3-nameC-blah-blah
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
(c) 2014 Brant Faircloth || http://faircloth-lab.org/
All rights reserved.
This code is distributed under a 3-clause BSD license. Please see
LICENSE.txt for more information.
Created on 13 June 2014 15:59 PDT (-0700)
"""
"""
The next-seq outputs data as if it came from 4 lanes. This also
means that if you have multiplexed, now you have 4 lanes of data
to merge. This is annoying across many samples.
This code will merge those reads. Logging and some error-checking
are turned off here, as the code requires external libraries (it is
part of PHYLUCE). It should run as-is, with just the python stdlib.
See 2nd file in gist for config file example.
"""
import os
import glob
import shutil
import argparse
import ConfigParser
#from phyluce.helpers import is_file, is_dir, FullPaths, CreateDir
#from phyluce.log import setup_logging
#import pdb
def get_args():
    """Build the command-line parser and return the parsed options.

    Reads the process arguments and returns the ``argparse`` namespace
    carrying ``input``, ``config``, ``output``, ``section`` and ``se``.
    """
    cli = argparse.ArgumentParser(
        description="""Given a config file and inputs, merge multiple gz files into one"""
    )
    # The three mandatory path options, registered in the order they are
    # listed in --help.  The phyluce validators/actions (is_file, FullPaths,
    # CreateDir) are disabled, so the values arrive as plain strings.
    required_paths = (
        ("--input", """The path to a directory containing the reads to merge."""),
        ("--config", """The path to the config file to use for merging."""),
        ("--output", """The path to a directory in which to store the output."""),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, required=True, help=help_text)
    cli.add_argument(
        "--section",
        type=str,
        default="samples",
        help="""The section holding the merge info."""
    )
    # NOTE: the --verbosity and --log-path options are disabled here because
    # they depend on the phyluce logging helpers, which are not imported.
    cli.add_argument(
        "--se",
        action="store_true",
        default=False,
        help="""Run is single-end.""",
    )
    options = cli.parse_args()
    return options
def _merge_lanes(input_dir, output_dir, sample_name, read):
    """Concatenate all lane files of one read (R1 or R2) of a sample.

    Files matching ``<sample_name>_S*_L*_<read>_*.fastq.gz`` in *input_dir*
    are appended, in sorted order, to a single file in *output_dir* named
    after the first match with ``_L001_`` replaced by ``_L999_``.

    Returns the base name of the merged file.
    Raises IOError when no input file matches the pattern.
    """
    pattern = "{}_S*_L*_{}_*.fastq.gz".format(sample_name, read)
    lane_files = sorted(glob.glob(os.path.join(input_dir, pattern)))
    if not lane_files:
        raise IOError(
            "No files matching {} found in {}".format(pattern, input_dir)
        )
    fname = os.path.basename(lane_files[0]).replace("_L001_", "_L999_")
    destination = os.path.join(output_dir, fname)
    # When the output directory is also the input directory, a previous
    # merged file matches the glob; never copy the destination into itself.
    destination_abs = os.path.abspath(destination)
    sources = [
        path for path in lane_files
        if os.path.abspath(path) != destination_abs
    ]
    if not sources:
        raise IOError(
            "No lane files to merge for pattern {} in {}".format(
                pattern, input_dir
            )
        )
    with open(destination, 'wb') as outfile:
        for path in sources:
            # Binary mode: the inputs are gzip data and are appended
            # byte-for-byte; the handle is closed as soon as it is copied.
            with open(path, 'rb') as infile:
                shutil.copyfileobj(infile, outfile)
    return fname


def main():
    """Merge the per-lane gzip files of every sample listed in the config.

    The sample names are the option names of the config section chosen with
    ``--section`` (values are not required).  For each sample the R1 files
    are merged, and the R2 files as well unless ``--se`` was given.

    Raises IOError when the config file cannot be read or when a sample has
    no matching input files.
    """
    args = get_args()
    # Logging is disabled in this standalone version (it needs phyluce).
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    # Keep sample names case-sensitive instead of lower-casing them.
    config.optionxform = str
    # read() silently skips unreadable files and returns the ones it parsed.
    if not config.read(args.config):
        raise IOError("Could not read config file: {}".format(args.config))
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    reads = ("R1",) if args.se else ("R1", "R2")
    for sample_name, _ in config.items(args.section):
        for read in reads:
            _merge_lanes(args.input, args.output, sample_name, read)
# Run the merge only when this file is executed as a script.
if __name__ == "__main__":
    main()
@elenamd
Copy link

elenamd commented Aug 11, 2015

I would like to try your script for merging fastq files but I'm having problems with the config file.
The SampleId of one sample is 15_7_24_144 as in basespace and this has 8 fastq files generated in Nextseq with a different name. Should the config file look like this:
[samples]
5063_S128_L001_R1_001
5063_S128_L001_R2_001
5063_S128_L002_R1_001
5063_S128_L002_R2_001
5063_S128_L003_R1_001
5063_S128_L003_R2_001
5063_S128_L004_R1_001
5063_S128_L004_R2_001

or

[samples]
15_7_24_144

Thanks for your help

@RanjeetaM
Copy link

I tried running your script to merge my sequences from Illumina machine uploaded to basespace. The script didn't run correctly. Could you suggest if it needs any further modifications. If not, could you please send the right format to run the script.

Thanks for the support.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment