Skip to content

Instantly share code, notes, and snippets.

@wckdouglas
Created August 6, 2015 15:59
Show Gist options
  • Save wckdouglas/052bd7c986fd65b3673c to your computer and use it in GitHub Desktop.
Save wckdouglas/052bd7c986fd65b3673c to your computer and use it in GitHub Desktop.
#include <string.h>
#include <unistd.h>

#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include <gzstream.h>
using namespace std;
// Split a FASTQ file into numbered output files that each hold a fixed
// number of records (4 lines per record), written either as plain text
// or gzip-compressed.
// Format a chunk index as the numeric suffix used in output file names.
//
// filenum : index of the output chunk (the callers start counting at 0).
//
// Returns the decimal text of filenum, prefixed with a single "0" when the
// value is below 10, so 0..9 become "00".."09" and larger values are
// returned unchanged ("10", "11", ..., "100", ...).
std::string fixfilenum(int filenum)
{
    std::ostringstream convert;
    // Pad only values below 10. The previous `filenum > 10` test also padded
    // 10 itself, producing "010" between "09" and "11".
    if (filenum < 10)
    {
        convert << '0';
    }
    convert << filenum;
    return convert.str();
}
// Split a FASTQ file into plain-text chunks of recordNum records each.
//
// fqFile     : path of the input file; it is read line by line through an
//              igzstream.
// filePrefix : prefix of the output files; chunks are written to
//              <filePrefix>_<suffix>.fastq, where <suffix> comes from
//              fixfilenum() applied to the chunk index (starting at 0).
// recordNum  : records per chunk. A record is counted as 4 lines. When
//              recordNum is not positive, every line goes to the first chunk.
//
// Progress and error messages go to stderr. The function returns early,
// after reporting the problem, if the input or an output file cannot be
// opened. Nothing is reported as "written" when no output file was created.
void splitFastq(char *fqFile, std::string filePrefix, int recordNum)
{
    std::cerr << "From " << fqFile << "...." << std::endl;
    std::cerr << "Splitting " << recordNum << " records per file" << std::endl;

    igzstream in(fqFile);
    if (!in.good())
    {
        std::cerr << "Cannot open " << fqFile << " for reading" << std::endl;
        return;
    }

    // Widen before multiplying so a large recordNum cannot overflow int.
    const long long maxLine = static_cast<long long>(recordNum) * 4;
    long long lineCount = 0;
    int filenum = 0;
    bool haveOutput = false; // true while a chunk file is open for writing
    std::string filename;
    std::ofstream outFile;

    for (std::string line; std::getline(in, line);)
    {
        // The current chunk is full: finish it before writing this line.
        if (haveOutput && lineCount == maxLine)
        {
            outFile.close();
            std::cerr << "written " << filename << std::endl;
            haveOutput = false;
            lineCount = 0;
            ++filenum;
        }
        // Open the next chunk lazily, so empty input creates no file.
        if (!haveOutput)
        {
            filename = filePrefix + "_" + fixfilenum(filenum) + ".fastq";
            outFile.open(filename.c_str());
            if (!outFile.good())
            {
                std::cerr << "Cannot open " << filename << " for writing" << std::endl;
                return;
            }
            haveOutput = true;
        }
        outFile << line << '\n';
        ++lineCount;
    }

    if (haveOutput)
    {
        outFile.close();
        std::cerr << "written " << filename << std::endl;
    }
}
// Split a FASTQ file into chunks of recordNum records each, writing every
// chunk through an ogzstream.
//
// fqFile     : path of the input file; it is read line by line through an
//              igzstream.
// filePrefix : prefix of the output files; chunks are written to
//              <filePrefix>_<suffix>.fastq.gz, where <suffix> comes from
//              fixfilenum() applied to the chunk index (starting at 0).
// recordNum  : records per chunk. A record is counted as 4 lines. When
//              recordNum is not positive, every line goes to the first chunk.
//
// Progress and error messages go to stderr. The function returns early,
// after reporting the problem, if the input or an output file cannot be
// opened. Nothing is reported as "written" when no output file was created.
void splitFastqZip(char *fqFile, std::string filePrefix, int recordNum)
{
    std::cerr << "From " << fqFile << "...." << std::endl;
    std::cerr << "Splitting " << recordNum << " records per file" << std::endl;

    igzstream in(fqFile);
    if (!in.good())
    {
        std::cerr << "Cannot open " << fqFile << " for reading" << std::endl;
        return;
    }

    // Widen before multiplying so a large recordNum cannot overflow int.
    const long long maxLine = static_cast<long long>(recordNum) * 4;
    long long lineCount = 0;
    int filenum = 0;
    bool haveOutput = false; // true while a chunk file is open for writing
    std::string filename;
    ogzstream outFile;

    for (std::string line; std::getline(in, line);)
    {
        // The current chunk is full: finish it before writing this line.
        if (haveOutput && lineCount == maxLine)
        {
            outFile.close();
            std::cerr << "written " << filename << std::endl;
            haveOutput = false;
            lineCount = 0;
            ++filenum;
        }
        // Open the next chunk lazily, so empty input creates no file.
        if (!haveOutput)
        {
            filename = filePrefix + "_" + fixfilenum(filenum) + ".fastq.gz";
            outFile.open(filename.c_str());
            if (!outFile.good())
            {
                std::cerr << "Cannot open " << filename << " for writing" << std::endl;
                return;
            }
            haveOutput = true;
        }
        outFile << line << '\n';
        ++lineCount;
    }

    if (haveOutput)
    {
        outFile.close();
        std::cerr << "written " << filename << std::endl;
    }
}
// Write the command-line help text to stderr.
//
// programname : name shown in the "usage:" line (main passes argv[0]).
void usage(std::string programname)
{
    // One synopsis line followed by the option list; adjacent literals are
    // concatenated by the compiler into a single string.
    std::cerr << "usage: " << programname
              << " -i <fqfile> -n <# of record per file> -o <prefix> [-z]\n"
                 "[options]\n"
                 "-i <fastq file>\n"
                 "-n <number of record in each splitted file> default: 10000000\n"
                 "-o <prefix>\n"
                 "-z optional: gzip output"
              << std::endl;
}
// Program entry point: parse the command line, then split the FASTQ file.
//
// Options (also printed by usage()):
//   -i <fastq file>  input file, required
//   -n <int>         records per output file, default 10000000; must be a
//                    positive integer
//   -o <prefix>      output file prefix, required
//   -z               write the chunks with splitFastqZip instead of splitFastq
//
// Returns 1 when arguments are missing or invalid, and 0 once one of the
// split functions has been called.
int main(int argc, char **argv){
    // Start as NULL so a missing -i can be detected; the pointer used to be
    // read uninitialized in that case.
    char *fqFile = NULL;
    int c, recordNum = 10000000;
    bool gz = false;
    std::string programname = argv[0];
    std::string filePrefix = "";

    // No arguments at all: just show the help text.
    if (argc == 1){
        usage(programname);
        return 1;
    }

    // Silence getopt's own diagnostics; errors are reported below.
    opterr = 0;
    while ((c = getopt(argc, argv, "i:n:o:z")) != -1){
        switch (c){
            case 'i':
                fqFile = optarg;
                break;
            case 'n':
            {
                // Accept only a complete, positive decimal integer. atoi()
                // silently turned garbage into 0, which put every record
                // into a single output file.
                std::istringstream parser(optarg);
                char trailing;
                if (!(parser >> recordNum) || (parser >> trailing) || recordNum <= 0){
                    std::cerr << "option n needs a positive integer argument!" << std::endl;
                    usage(programname);
                    return 1;
                }
                break;
            }
            case 'o':
                filePrefix = optarg;
                break;
            case 'z':
                gz = true;
                break;
            case '?':
                // Either a known option is missing its argument or the
                // option is unknown; both end with the help text.
                if (optopt == 'n' || optopt == 'i' || optopt == 'o'){
                    std::cerr << "option n, i, o need arguments!" << std::endl;
                }
                usage(programname);
                return 1;
            default:
                abort();
        }
    }

    // Both the input file and the output prefix are mandatory.
    if (filePrefix.empty() || fqFile == NULL || fqFile[0] == '\0')
    {
        usage(programname);
        return 1;
    }

    if (gz)
    {
        splitFastqZip(fqFile, filePrefix, recordNum);
    }
    else
    {
        splitFastq(fqFile, filePrefix, recordNum);
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment