Created
May 13, 2013 16:13
-
-
Save guillermo-carrasco/5569506 to your computer and use it in GitHub Desktop.
Unique read IDs for simLibrary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff -ru simNGS.orig/src/simlibrary.c simNGS/src/simlibrary.c | |
--- simNGS.orig/src/simlibrary.c 2013-05-13 17:45:17.000000000 +0200 | |
+++ simNGS/src/simlibrary.c 2013-05-13 18:06:41.000000000 +0200 | |
@@ -33,12 +33,13 @@ | |
#define QUOTE(A) Q_(A) | |
#define DEFAULT_COV 0.055 | |
#define DEFAULT_OUT "fasta" | |
+#define DEFAULT_UNIQUE "not" | |
#define DEFAULT_INSERT 400 | |
#define DEFAULT_NCYCLE 45 | |
#define DEFAULT_COVERAGE 2.0 | |
#define DEFAULT_BIAS 0.5 | |
#define PROGNAME "simLibrary" | |
-#define PROGVERSION "1.4" | |
+#define PROGVERSION "1.5" | |
uint32_t nfragment_from_coverage(const uint32_t genlen, const real_t coverage, const uint32_t readlen, const bool paired){ | |
const uint32_t bases_per_read = paired?(2*readlen):readlen; | |
@@ -71,7 +72,7 @@ | |
"Split sequence into a simulated library of fragments\n" | |
"\n" | |
"Usage:\n" | |
-"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen]\n" | |
+"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen] [-u unique]\n" | |
"\t [-m multiplier_file] [-n nfragments] -p [-r readlen] [-s strand]\n" | |
"\t [-v variance] [-x coverage] [-o output] [--seed seed] seq1.fa ...\n" | |
"\t" PROGNAME " --help\n" | |
@@ -124,6 +125,9 @@ | |
"and \"casava\", for a more standard (CASAVA 1.8) format as shown in:\n" | |
"http://en.wikipedia.org/wiki/FASTQ_format.\n" | |
"\n" | |
+"-u, --unique [default: " QUOTE(DEFAULT_UNIQUE) "]\n" | |
+"\tGenerate unique identifiers for the reads, independently if the reference fasta has\n" | |
+"more than one chromosome or if the input are several files.\n" | |
"-g, --gel_cut lower:upper [default: no cut]\n" | |
"\tStrict lower and upper boundaries for fragment length, representing a\n" | |
"\"cut\" of a gel. The default is no boundaries.\n" | |
@@ -191,6 +195,7 @@ | |
{ "bias", required_argument, NULL, 'b'}, | |
{ "cov", required_argument, NULL, 'c'}, | |
{ "output", required_argument, NULL, 'o'}, | |
+ { "unique", required_argument, NULL, 'u'}, | |
{ "gel_cut", required_argument, NULL, 'g'}, | |
{ "insert", required_argument, NULL, 'i'}, | |
{ "mutate", optional_argument, NULL, 3}, | |
@@ -213,7 +218,7 @@ | |
bool paired; | |
uint32_t seed; | |
real_t variance,cov,strand_bias,coverage; | |
- CSTRING output; | |
+ CSTRING output, unique; | |
enum strand_opt strand; | |
FILE * multiplier_fp; | |
real_t cut_lower, cut_upper; | |
@@ -231,6 +236,7 @@ | |
opt->cov = DEFAULT_COV; | |
opt->coverage = DEFAULT_COVERAGE; | |
opt->output = DEFAULT_OUT; | |
+ opt->unique = DEFAULT_UNIQUE; | |
opt->nfragment = 0; | |
opt->paired = true; | |
opt->strand_bias = DEFAULT_BIAS; | |
@@ -276,7 +282,7 @@ | |
OPT opt = new_OPT(); | |
validate(NULL!=opt,NULL); | |
- while ((ch = getopt_long(argc, argv, "b:c:o:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){ | |
+ while ((ch = getopt_long(argc, argv, "b:c:o:u:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){ | |
switch(ch){ | |
case 'b': | |
opt->strand_bias = parse_real(optarg); | |
@@ -290,6 +296,8 @@ | |
if(!strncasecmp(optarg,"fasta",5)){ opt->output = "fasta"; break;} | |
if(!strncasecmp(optarg,"casava",6)){ opt->output = "casava"; break;} | |
errx(EXIT_FAILURE,"Unrecognised choice \"%s\" for --output format",optarg); break; | |
+ case 'u': | |
+ if(!strncasecmp(optarg,"unique",6)){ opt->unique = "unique"; break;} | |
case 'g': | |
ret = sscanf(optarg, real_format_str ":" real_format_str ,&opt->cut_lower,&opt->cut_upper); | |
if(ret<1){ | |
@@ -421,6 +429,9 @@ | |
warnx("Failed to open file \"%s\" for input",argv[0]); | |
} | |
} | |
+ //Keep a counter with the number of chromosomes (or files) | |
+ //to get unique read IDs | |
+ uint32_t chromosomes = 0; | |
while (NULL!=fp && (seq=sequence_from_fasta(fp))!=NULL){ | |
// Read multiplier from file, if available | |
double multiplier = 1.; | |
@@ -440,6 +451,8 @@ | |
// Number of fragments | |
uint32_t nfragment = multiplier * ( (opt->nfragment)?opt->nfragment:nfragment_from_coverage(seq->length,opt->coverage,opt->ncycle,opt->paired) ); | |
+ uint32_t seq_id = 0; | |
+ if(!strcmp(opt->unique, "unique")) seq_id = chromosomes*nfragment; | |
for ( uint32_t i=0 ; i<nfragment ; i++,tot_fragments++){ | |
const uint32_t fraglen = (opt->paired)?(uint32_t)(rlognorm_with_cuts(log_mean,log_sd,opt->cut_lower,opt->cut_upper)):opt->ncycle; | |
if(fraglen>seq->length){ | |
@@ -458,7 +471,7 @@ | |
SEQ sampseq = (strand=='+')? copy_SEQ(mutseq) : reverse_complement_SEQ(mutseq,false); | |
free_SEQ(mutseq); | |
- CSTRING sampname = fragname(seq->name,i+1,strand,loc,fraglen, opt->output); | |
+ CSTRING sampname = fragname(seq->name,seq_id+i+1,strand,loc,fraglen, opt->output); | |
free_CSTRING(sampseq->name); | |
sampseq->name = sampname; | |
@@ -467,6 +480,7 @@ | |
if( (tot_fragments%100000)==99999 ){ fprintf(stderr,"\rDone: %8u",tot_fragments+1); } | |
} | |
free_SEQ(seq); | |
+ chromosomes++; | |
} | |
fclose(fp); | |
argc--; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment