guillermo-carrasco · May 13, 2013 16:13
diff --git a/simNGS_unique_ids.patch b/simNGS_unique_ids.patch
 diff -ru simNGS.orig/src/simlibrary.c simNGS/src/simlibrary.c
 --- simNGS.orig/src/simlibrary.c  2013-05-13 17:45:17.000000000 +0200
 +++ simNGS/src/simlibrary.c	2013-05-13 18:06:41.000000000 +0200
 @@ -33,12 +33,13 @@
 #define QUOTE(A) Q_(A)
 #define DEFAULT_COV         0.055
 #define DEFAULT_OUT         "fasta"
 +#define DEFAULT_UNIQUE      "not"
 #define DEFAULT_INSERT      400
 #define DEFAULT_NCYCLE      45
 #define DEFAULT_COVERAGE    2.0
 #define DEFAULT_BIAS        0.5
 #define PROGNAME "simLibrary"
 -#define PROGVERSION "1.4"
 +#define PROGVERSION "1.5"

 uint32_t nfragment_from_coverage(const uint32_t genlen, const real_t coverage, const uint32_t readlen, const bool paired){
     const uint32_t bases_per_read = paired?(2*readlen):readlen;
 @@ -71,7 +72,7 @@
 "Split sequence into a simulated library of fragments\n"
 "\n"
 "Usage:\n"
 -"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen]\n"
 +"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen] [-u unique]\n"
 "\t           [-m multiplier_file] [-n nfragments] -p [-r readlen] [-s strand]\n"
 "\t           [-v variance] [-x coverage] [-o output] [--seed seed] seq1.fa ...\n"
 "\t" PROGNAME " --help\n"
 @@ -124,6 +125,9 @@
 "and \"casava\", for a more standard (CASAVA 1.8) format as shown in:\n"
 "http://en.wikipedia.org/wiki/FASTQ_format.\n"
 "\n"
 +"-u, --unique [default: " QUOTE(DEFAULT_UNIQUE) "]\n"
 +"\tGenerate unique identifiers for the reads, independently if the reference fasta has\n"
 +"more than one chromosome or if the input are several files.\n"
 "-g, --gel_cut lower:upper [default: no cut]\n"
 "\tStrict lower and upper boundaries for fragment length, representing a\n"
 "\"cut\" of a gel. The default is no boundaries.\n"
 @@ -191,6 +195,7 @@
     { "bias",       required_argument, NULL, 'b'},
     { "cov",        required_argument, NULL, 'c'},
     { "output",     required_argument, NULL, 'o'},
 +    { "unique",     required_argument, NULL, 'u'},
     { "gel_cut",    required_argument, NULL, 'g'},
     { "insert",     required_argument, NULL, 'i'},
     { "mutate",	    optional_argument, NULL, 3},
 @@ -213,7 +218,7 @@
     bool paired;
     uint32_t seed;
     real_t variance,cov,strand_bias,coverage;
 -    CSTRING output;
 +    CSTRING output, unique;
     enum strand_opt strand;
     FILE * multiplier_fp;
     real_t cut_lower, cut_upper;
 @@ -231,6 +236,7 @@
     opt->cov = DEFAULT_COV;
     opt->coverage = DEFAULT_COVERAGE;
     opt->output = DEFAULT_OUT;
 +    opt->unique = DEFAULT_UNIQUE;
     opt->nfragment = 0;
     opt->paired = true;
     opt->strand_bias = DEFAULT_BIAS;
 @@ -276,7 +282,7 @@
     OPT opt = new_OPT();
     validate(NULL!=opt,NULL);

 -    while ((ch = getopt_long(argc, argv, "b:c:o:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){
 +    while ((ch = getopt_long(argc, argv, "b:c:o:u:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){
         switch(ch){
         case 'b':
             opt->strand_bias = parse_real(optarg);
 @@ -290,6 +296,8 @@
                 if(!strncasecmp(optarg,"fasta",5)){ opt->output = "fasta"; break;}
                 if(!strncasecmp(optarg,"casava",6)){ opt->output = "casava"; break;}
                 errx(EXIT_FAILURE,"Unrecognised choice \"%s\" for --output format",optarg); break;
 +        case 'u':
 +                if(!strncasecmp(optarg,"unique",6)){ opt->unique = "unique"; break;}
 	case 'g':
 	    ret = sscanf(optarg, real_format_str ":" real_format_str ,&opt->cut_lower,&opt->cut_upper);
 	    if(ret<1){
 @@ -421,6 +429,9 @@
                 warnx("Failed to open file \"%s\" for input",argv[0]);
             }
         }
 +        //Keep a counter with the number of chromosomes (or files)
 +        //to get unique read IDs
 +        uint32_t chromosomes = 0;
         while (NULL!=fp && (seq=sequence_from_fasta(fp))!=NULL){
             // Read multiplier from file, if available
             double multiplier = 1.;
 @@ -440,6 +451,8 @@

             // Number of fragments
             uint32_t nfragment = multiplier * ( (opt->nfragment)?opt->nfragment:nfragment_from_coverage(seq->length,opt->coverage,opt->ncycle,opt->paired) );
 +            uint32_t seq_id = 0;
 +            if(!strcmp(opt->unique, "unique")) seq_id = chromosomes*nfragment;
             for ( uint32_t i=0 ; i<nfragment ; i++,tot_fragments++){
                 const uint32_t fraglen = (opt->paired)?(uint32_t)(rlognorm_with_cuts(log_mean,log_sd,opt->cut_lower,opt->cut_upper)):opt->ncycle;
                 if(fraglen>seq->length){
 @@ -458,7 +471,7 @@
 		SEQ sampseq = (strand=='+')? copy_SEQ(mutseq) : reverse_complement_SEQ(mutseq,false);
 		free_SEQ(mutseq);

 -                CSTRING sampname = fragname(seq->name,i+1,strand,loc,fraglen, opt->output);
 +                CSTRING sampname = fragname(seq->name,seq_id+i+1,strand,loc,fraglen, opt->output);

                 free_CSTRING(sampseq->name);
                 sampseq->name = sampname;
 @@ -467,6 +480,7 @@
                 if( (tot_fragments%100000)==99999 ){ fprintf(stderr,"\rDone: %8u",tot_fragments+1); }
             }
             free_SEQ(seq);
 +            chromosomes++;
         }
         fclose(fp);
         argc--;
	diff -ru simNGS.orig/src/simlibrary.c simNGS/src/simlibrary.c
	--- simNGS.orig/src/simlibrary.c 2013-05-13 17:45:17.000000000 +0200
	+++ simNGS/src/simlibrary.c 2013-05-13 18:06:41.000000000 +0200
	@@ -33,12 +33,13 @@
	#define QUOTE(A) Q_(A)
	#define DEFAULT_COV 0.055
	#define DEFAULT_OUT "fasta"
	+#define DEFAULT_UNIQUE "not"
	#define DEFAULT_INSERT 400
	#define DEFAULT_NCYCLE 45
	#define DEFAULT_COVERAGE 2.0
	#define DEFAULT_BIAS 0.5
	#define PROGNAME "simLibrary"
	-#define PROGVERSION "1.4"
	+#define PROGVERSION "1.5"

	uint32_t nfragment_from_coverage(const uint32_t genlen, const real_t coverage, const uint32_t readlen, const bool paired){
	const uint32_t bases_per_read = paired?(2*readlen):readlen;
	@@ -71,7 +72,7 @@
	"Split sequence into a simulated library of fragments\n"
	"\n"
	"Usage:\n"
	-"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen]\n"
	+"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen] [-u unique]\n"
	"\t [-m multiplier_file] [-n nfragments] -p [-r readlen] [-s strand]\n"
	"\t [-v variance] [-x coverage] [-o output] [--seed seed] seq1.fa ...\n"
	"\t" PROGNAME " --help\n"
	@@ -124,6 +125,9 @@
	"and \"casava\", for a more standard (CASAVA 1.8) format as shown in:\n"
	"http://en.wikipedia.org/wiki/FASTQ_format.\n"
	"\n"
	+"-u, --unique [default: " QUOTE(DEFAULT_UNIQUE) "]\n"
	+"\tGenerate unique identifiers for the reads, independently if the reference fasta has\n"
	+"more than one chromosome or if the input are several files.\n"
	"-g, --gel_cut lower:upper [default: no cut]\n"
	"\tStrict lower and upper boundaries for fragment length, representing a\n"
	"\"cut\" of a gel. The default is no boundaries.\n"
	@@ -191,6 +195,7 @@
	{ "bias", required_argument, NULL, 'b'},
	{ "cov", required_argument, NULL, 'c'},
	{ "output", required_argument, NULL, 'o'},
	+ { "unique", required_argument, NULL, 'u'},
	{ "gel_cut", required_argument, NULL, 'g'},
	{ "insert", required_argument, NULL, 'i'},
	{ "mutate", optional_argument, NULL, 3},
	@@ -213,7 +218,7 @@
	bool paired;
	uint32_t seed;
	real_t variance,cov,strand_bias,coverage;
	- CSTRING output;
	+ CSTRING output, unique;
	enum strand_opt strand;
	FILE * multiplier_fp;
	real_t cut_lower, cut_upper;
	@@ -231,6 +236,7 @@
	opt->cov = DEFAULT_COV;
	opt->coverage = DEFAULT_COVERAGE;
	opt->output = DEFAULT_OUT;
	+ opt->unique = DEFAULT_UNIQUE;
	opt->nfragment = 0;
	opt->paired = true;
	opt->strand_bias = DEFAULT_BIAS;
	@@ -276,7 +282,7 @@
	OPT opt = new_OPT();
	validate(NULL!=opt,NULL);

	- while ((ch = getopt_long(argc, argv, "b:c:o:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){
	+ while ((ch = getopt_long(argc, argv, "b:c:o:u:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){
	switch(ch){
	case 'b':
	opt->strand_bias = parse_real(optarg);
	@@ -290,6 +296,8 @@
	if(!strncasecmp(optarg,"fasta",5)){ opt->output = "fasta"; break;}
	if(!strncasecmp(optarg,"casava",6)){ opt->output = "casava"; break;}
	errx(EXIT_FAILURE,"Unrecognised choice \"%s\" for --output format",optarg); break;
	+ case 'u':
	+ if(!strncasecmp(optarg,"unique",6)){ opt->unique = "unique"; break;}
	case 'g':
	ret = sscanf(optarg, real_format_str ":" real_format_str ,&opt->cut_lower,&opt->cut_upper);
	if(ret<1){
	@@ -421,6 +429,9 @@
	warnx("Failed to open file \"%s\" for input",argv[0]);
	}
	}
	+ //Keep a counter with the number of chromosomes (or files)
	+ //to get unique read IDs
	+ uint32_t chromosomes = 0;
	while (NULL!=fp && (seq=sequence_from_fasta(fp))!=NULL){
	// Read multiplier from file, if available
	double multiplier = 1.;
	@@ -440,6 +451,8 @@

	// Number of fragments
	uint32_t nfragment = multiplier * ( (opt->nfragment)?opt->nfragment:nfragment_from_coverage(seq->length,opt->coverage,opt->ncycle,opt->paired) );
	+ uint32_t seq_id = 0;
	+ if(!strcmp(opt->unique, "unique")) seq_id = chromosomes*nfragment;
	for ( uint32_t i=0 ; i<nfragment ; i++,tot_fragments++){
	const uint32_t fraglen = (opt->paired)?(uint32_t)(rlognorm_with_cuts(log_mean,log_sd,opt->cut_lower,opt->cut_upper)):opt->ncycle;
	if(fraglen>seq->length){
	@@ -458,7 +471,7 @@
	SEQ sampseq = (strand=='+')? copy_SEQ(mutseq) : reverse_complement_SEQ(mutseq,false);
	free_SEQ(mutseq);

	- CSTRING sampname = fragname(seq->name,i+1,strand,loc,fraglen, opt->output);
	+ CSTRING sampname = fragname(seq->name,seq_id+i+1,strand,loc,fraglen, opt->output);

	free_CSTRING(sampseq->name);
	sampseq->name = sampname;
	@@ -467,6 +480,7 @@
	if( (tot_fragments%100000)==99999 ){ fprintf(stderr,"\rDone: %8u",tot_fragments+1); }
	}
	free_SEQ(seq);
	+ chromosomes++;
	}
	fclose(fp);
	argc--;
No results found