Created
June 13, 2014 23:31
-
-
Save brantfaircloth/d69387a8fe3e8eae9860 to your computer and use it in GitHub Desktop.
Merge next-seq gzip files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this is the SampleId column of your SampleSheet.csv
[samples]
sample1-nameA-blah-blah
sample2-nameB-blah-blah
sample3-nameC-blah-blah
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
(c) 2014 Brant Faircloth || http://faircloth-lab.org/
All rights reserved.
This code is distributed under a 3-clause BSD license. Please see
LICENSE.txt for more information.
Created on 13 June 2014 15:59 PDT (-0700)
"""
"""
The NextSeq outputs data as if it came from 4 lanes. This also
means that if you have multiplexed, you now have 4 lanes of data
to merge. This is annoying across many samples.
This code will merge those reads. Logging and some error-checking
are turned off here, as that code requires external libraries (it is
part of PHYLUCE). It should run as-is, with just the Python stdlib.
See the 2nd file in the gist for a config file example.
"""
import argparse
import glob
import os
import shutil

try:
    import ConfigParser  # Python 2 module name
except ImportError:
    # Python 3 renamed the module; keep the name the rest of the file uses.
    import configparser as ConfigParser
#from phyluce.helpers import is_file, is_dir, FullPaths, CreateDir | |
#from phyluce.log import setup_logging | |
#import pdb | |
def get_args(argv=None):
    """Parse the command-line options for the merge script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. The default (None) makes argparse read
            ``sys.argv[1:]``, which is the original behavior.

    Returns:
        argparse.Namespace with the attributes ``input``, ``config``,
        ``output``, ``section`` (default ``"samples"``) and ``se``
        (default ``False``).
    """
    parser = argparse.ArgumentParser(
        description="""Given a config file and inputs, merge multiple gz files into one"""
    )
    parser.add_argument(
        "--input",
        required=True,
        help="""The path to a directory containing the reads to merge."""
    )
    parser.add_argument(
        "--config",
        required=True,
        help="""The path to the config file to use for merging."""
    )
    parser.add_argument(
        "--output",
        required=True,
        help="""The path to a directory in which to store the output."""
    )
    parser.add_argument(
        "--section",
        type=str,
        default="samples",
        help="""The section holding the merge info."""
    )
    # NOTE: the --verbosity and --log-path options, and the is_file / is_dir /
    # FullPaths / CreateDir argument helpers, are deliberately left out: they
    # come from PHYLUCE and this script is meant to need only the stdlib.
    parser.add_argument(
        "--se",
        action="store_true",
        default=False,
        help="""Run is single-end.""",
    )
    return parser.parse_args(argv)
def _merge_reads(sample_name, read, input_dir, output_dir):
    """Concatenate all lane files of one read direction for one sample.

    Args:
        sample_name: Sample identifier that prefixes the fastq.gz file names.
        read: Read tag embedded in the file names ("R1" or "R2").
        input_dir: Directory searched for the per-lane files.
        output_dir: Existing directory that receives the merged file.

    Returns:
        The base name of the merged file that was written.

    Raises:
        IOError: If no file in ``input_dir`` matches the sample/read pattern.
    """
    pattern = "{}_S*_L*_{}_*.fastq.gz".format(sample_name, read)
    lane_files = sorted(glob.glob(os.path.join(input_dir, pattern)))
    if not lane_files:
        raise IOError(
            "No files matching '{}' found in '{}'".format(pattern, input_dir)
        )
    # The merged file is named after the first lane file, with the lane
    # field rewritten to 999.
    fname = os.path.basename(lane_files[0]).replace("_L001_", "_L999_")
    out_path = os.path.join(output_dir, fname)
    # When input and output are the same directory, a merged file left by a
    # previous run also matches the glob; skip it so the output is never
    # copied into itself.
    out_abs = os.path.abspath(out_path)
    sources = [f for f in lane_files if os.path.abspath(f) != out_abs]
    with open(out_path, 'wb') as outfile:
        for source in sources:
            # Binary mode: the bytes are copied verbatim, one lane file after
            # another, into the single output file.
            with open(source, 'rb') as infile:
                shutil.copyfileobj(infile, outfile)
    return fname


def main():
    """Merge the per-lane fastq.gz files of every sample in the config.

    Reads the sample names from the ``--section`` section of the config
    file (names only, no values required) and, for each one, writes one
    merged R1 file and, unless ``--se`` was given, one merged R2 file to
    the output directory, creating that directory if needed.

    Raises:
        IOError: If the config file cannot be read, or if a sample has no
            matching input files.
    """
    args = get_args()
    # Logging (PHYLUCE's setup_logging) is intentionally disabled so the
    # script runs with the standard library alone.
    # Parse the config file, allowing keys without values (no ":" or "=").
    # ``ConfigParser`` is the config-parser module bound by the file-level
    # imports.
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    # Keep sample names case-sensitive; the default transform lower-cases keys.
    config.optionxform = str
    # read() skips files it cannot open and returns the list it did parse.
    if not config.read(args.config):
        raise IOError("Could not read config file '{}'".format(args.config))
    samples = config.items(args.section)
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    reads = ("R1",) if args.se else ("R1", "R2")
    for sample_name, _ in samples:
        for read in reads:
            _merge_reads(sample_name, read, args.input, args.output)
# Run the merge only when this file is executed as a script.
if __name__ == '__main__':
    main()
I tried running your script to merge my sequences from an Illumina machine that were uploaded to BaseSpace. The script didn't run correctly. Could you tell me whether it needs any further modifications? If not, could you please send the right format for running the script?
Thanks for the support.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I would like to try your script for merging fastq files but I'm having problems with the config file.
The SampleId of one sample is 15_7_24_144 in BaseSpace, and it has 8 FASTQ files generated on the NextSeq under a different name. Should the config file look like this:
[samples]
5063_S128_L001_R1_001
5063_S128_L001_R2_001
5063_S128_L002_R1_001
5063_S128_L002_R2_001
5063_S128_L003_R1_001
5063_S128_L003_R2_001
5063_S128_L004_R1_001
5063_S128_L004_R2_001
or
[samples]
15_7_24_144
Thanks for your help