jxtx · October 14, 2020 17:29
diff --git a/blast.c b/blast.c
 /*
 * BLAST - Search two DNA sequences for locally maximal segment pairs. The basic
 * command syntax is
 *
 * 	BLAST sequence1 sequence2
 *
 * where sequence1 and sequence2 name files containing DNA sequences.  Lines
 * at the beginnings of the files that don't start with 'A', 'C', 'T' or 'G'
 * are discarded.  Thus a typical sequence file might begin:
 *
 *	>BOVHBPP3I  Bovine beta-globin psi-3 pseudogene, 5' end.
 *	GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT
 *	GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC
 *
 * If sequence1 and sequence2 are identical file names, then matches are
 * computed without "mirror image" pairs and without the trivial long match.
 *
 * The BLAST command permits optional additional arguments, such as X=50,
 * that reset certain internal parameters.  The available parameters are:
 *
 *	M or m gives the score for a match.
 *	I or i gives the score for a transition (A <--> G, C <--> T).
 *	V or v gives the score for a tranversion (non-transition change).
 *	W or w gives the word size.
 *	X or x gives the value for terminating word extensions.
 *	K or k give the cutoff.
 *	(defaults: M = 1, I = -1, V = -1, W = 8, X = 10*M,
 *	 and  K = 5% significance level)
 *
 * If W is set to 0, then all locally maximum segment pairs (LMSPs) are
 * computed; in this case the value of X is irrelevant.
 *
 * In addition, the word "noalign" requests that no alignments be printed
 * (summary statistics for each locally maximal segment pair are reported)
 * and the word "stats" requests that statistics concerning the performance
 * of BLAST be printed.  Thus a typical command is
 *
 *	BLAST SEQ1 SEQ2 M=1 I=0 V=-1 noalign
 */
 #include <stdio.h>
 #include <math.h>
 #include <ctype.h>

 #define max(x,y) ((x > y) ? (x) : (y))
 #define min(x,y) ((x < y) ? (x) : (y))
 #define SUBSTITUTION_SCORE(x,y) S[x][y]

 #define DEFAULT_M	1	/* default score for match */
 #define DEFAULT_I	-1	/* default score for transition */
 #define DEFAULT_V	-1	/* default score for transversion */
 #define DEFAULT_W	8	/* default word size */
 #define DEFAULT_SIG	0.05	/* default significance level for setting K */

 char
 	S[128][128],
 	*s1, *seq1, *s2, *seq2;
 int
 	alignments,
 	print_stats,
 	K,		/* cutoff score */
 	M,		/* score for a match */
 	I,		/* score for a transition */
 	V,		/* score for a transversion */
 	W,		/* word length */
 	X,		/* cutoff for hit extensions */
 	sig99, sig90, sig50,	/* number of MPSs above given significance */
 	numMSPs,	/* total number of MSPs recorded */
 	numhits,	/* number of word hits */
 	missW[15][3],
 	missX[25][3];

 double
 	param_K, param_lambda;

 long
 	*diag_lev,	/* extent of discovered matches on a given diagonal */
 	len1, len2;	/* sequence lengths */


 typedef struct msp {
 	long len, pos1, pos2;
 	int score;
 	struct msp *next_msp;
 	} Msp, *Msp_ptr;

 Msp_ptr msp_list, *msp;

 main(argc, argv)
 int argc;
 char *argv[];
 {
 	int i, v;
 	double x, log(), sqrt(), significance();
 	char *ckalloc();

 	if (argc < 3)
 		fatalf("%s seq1 seq2 [M=] [I=] [V=] [K=] [W=] [X=] [noalign] [stats]",
 			argv[0]);
 	M = DEFAULT_M;
 	I = DEFAULT_I;
 	V = DEFAULT_V;
 	W = DEFAULT_W;
 	X = -1;
 	alignments = 1;
 	print_stats = 0;
 	while (--argc > 2) {
 		if (strcmp(argv[argc],"noalign") == 0) {
 			alignments = 0;
 			continue;
 		}
 		if (strcmp(argv[argc],"stats") == 0) {
 			print_stats = 1;
 			continue;
 		}
 		if (argv[argc][1] != '=')
 			fatalf("argument %d has improper form", argc);
 		v = atoi(argv[argc] + 2);
 		switch (argv[argc][0]) {
 			case 'M':
 			case 'm':
 				M = v;
 				break;
 			case 'I':
 			case 'i':
 				I = v;
 				break;
 			case 'V':
 			case 'v':
 				V = v;
 				break;
 			case 'K':
 			case 'k':
 				K = v;
 				break;
 			case 'W':
 			case 'w':
 				W = v;
 				break;
 			case 'X':
 			case 'x':
 				X = v;
 				break;
 			default:
 				fatal("Options are M, I, V, K, W, X.");
 		}
 	}

 	if (X == -1)
 		X = 10*M;
 	if (V > I)
 		fatal("transversions score higher than transitions?");

 	len1 = get_seq(argv[1], &seq1);
 	if (strcmp(argv[1],argv[2]) == 0) {
 		if (W == 0)
 			fatal("file1 = file2 not implemented when W=0");
 		seq2 = seq1;
 		len2 = len1;
 	} else
 		len2 = get_seq(argv[2], &seq2);
 	s1 = seq1 - 1;	/* so subscripts start with 1 */
 	s2 = seq2 - 1;
 	diag_lev = (long *)ckalloc((len1+len2+1)*sizeof(long)) + len1;

 	/* set up scoring matrix */
 	S['A']['A'] = S['C']['C'] = S['G']['G'] = S['T']['T'] = M;
 	S['A']['G'] = S['G']['A'] = S['C']['T'] = S['T']['C'] = I;
 	S['A']['C'] = S['A']['T'] = S['C']['A'] = S['C']['G'] =
 		S['G']['C'] = S['G']['T'] = S['T']['A'] = S['T']['G'] = V;

 	get_params();
 	if (K == 0) {
 		x = log(DEFAULT_SIG)/(-param_K*len1*len2);
 		K = 2*log(sqrt(x))/(-param_lambda);
 		while (significance(K-1) >= DEFAULT_SIG)
 			--K;
 		while (significance(K) < DEFAULT_SIG)
 			++K;
 	}

 	if (alignments) {
 		printf("#:lav\n\n");
 		printf("d {\n\"BLAST output with parameters:\n");
 		printf("M = %d, I = %d, V = %d\n", M, I, V);
 		if (W > 0)
 			printf("W = %d, X = %d\"", W, X);
 		else
 			printf("W = 0 (computes all LMSPs)\"");
 		printf("\n}\n");
 		printf("s {\n  \"%s\" 1 %d\n  \"%s\" 1 %d\n}\n",
 			argv[1], len1, argv[2], len2);
 		printf("k {\n  \"significance\"\n}\n");
 	}

 	if (W > 0) {
 		bld_table();
 		search();
 	} else
 		get_msps();

 	sort_msps();

 	if (!alignments)
 		printf("\n\n      Seq. 1  Seq. 2   len  score  signif    W    X\n");
 	/* print the matches for each sequence */
 	for (i = 0; i < numMSPs; ++i)
 		print_msp(i);

 	if (W == 0 && print_stats) {
 		printf("\n%3d MSPs above significance level 0.99\n", sig99);
 		printf("%3d MSPs above significance level 0.90\n", sig90);
 		printf("%3d MSPs above significance level 0.50\n", sig50);
 		printf("\nPercent of MSPs missed by BLAST at various");
 		printf(" W and significance levels:\n");
 		printf(" W:   0.99   0.90   0.50\n");
 		for (i = 4; i <= 12; ++i)
 			printf("%2d:  %4.1f%%  %4.1f%%  %4.1f%%\n",
 				i,
 				100*missW[i][0]/((float)sig99),
 				100*missW[i][1]/((float)sig90),
 				100*missW[i][2]/((float)sig50));
 		printf("\nPercent of MSPs missed by BLAST at various");
 		printf(" X and significance levels:\n");
 		printf(" X:   0.99   0.90   0.50\n");
 		for (i = 1; i <= 20; ++i)
 			printf("%2d:  %4.1f%%  %4.1f%%  %4.1f%%\n",
 				i,
 				100*missX[i][0]/((float)sig99),
 				100*missX[i][1]/((float)sig90),
 				100*missX[i][2]/((float)sig50));
 	}
 	if (W > 0 && print_stats) {
 		printf("\n\nStatistics:\n");
 		printf("  %s: M = %d, I = %d, V = %d, K = %d, W = %d, X = %d\n",
 			argv[0], M, I, V, K, W, X);
 		printf("  # of word hits = %d\n", numhits);
 		printf("  # of matches found = %d\n", numMSPs);
 	}

 	if (strcmp(argv[1], argv[2]) == 0) {
 		/* self-comparison; insert trivial diagonal */
 		printf("a {\n  s 0.0\n  b 1 1\n  e %d %d\n", len1, len1);
 		printf("  l 1 1 %d %d 0.0\n}\n", len1, len1);
 	}

 	exit (0);
 }

 /* get_seq - read a sequence */
 int get_seq(file_name, seqptr)
 char *file_name, **seqptr;
 {
 	FILE *qp, *ckopen();
 	char *p, *fgets(), *strchr(), *ckalloc();
 	int c;
 	long n;
 		
 	qp = ckopen(file_name, "r");
 	for (n = 0; (c = getc(qp)) != EOF; )
 		if (c != '\n')
 			++n;
 	*seqptr = ckalloc(n+1);
 	rewind(qp);
 	if (n == 0)
 		return 0;
 	for (p = *seqptr; ; ) {
 		if (fgets(p, (int)n+1, qp) == NULL) {
 			fclose(qp);
 			*p = '\0';
 			break;
 		}
 		if (p == *seqptr && strchr("ACTG", *p) == NULL)
 			continue;
 		while (isupper(*p))
 			++p;
 		if (*p != '\n' && *p != '\0')
 			fatalf("Illegal character %c in query sequence.", *p);
 	}
 	return p - *seqptr;
 }

 /* -------------   build table of W-tuples in the first sequence ------------*/

 /* The data structure could be simplified if we limited W to, say, W <= 8. */

 struct hash_node {
 	long ecode;		/* integer encoding of the word */
 	int pos;		/* positions where word hits query sequence */
 	struct hash_node *link;	/* next word with same last 7.5 letters */
 };

 #define HASH_SIZE 32767		/* 2**15 - 1 */

 struct hash_node *hashtab[HASH_SIZE+1];
 int encoding[128];
 int mask;
 long *next_pos;

 bld_table()
 {
 	long ecode;
 	int i;
 	char *q, *ckalloc();

 	/* perform initializations */
 	encoding['C'] = 1;
 	encoding['G'] = 2;
 	encoding['T'] = 3;
 	mask = (1 << (W+W-2)) - 1;
 	next_pos = (long *)ckalloc(len1*sizeof(long));

 	q = seq1 - 1;
 	ecode = 0L;
 	for (i = 1; i < W; ++i)
 		ecode = (ecode << 2) + encoding[*++q];
 	while (*++q) {
 		ecode = ((ecode & mask) << 2) + encoding[*q];
 		add_word(ecode, (long)(q - seq1 +1));
 	}
 }

 /* add_word - add a word to the table of critical words */
 add_word(ecode, pos)
 long ecode, pos;
 {
 	struct hash_node *h;
 	long hval;
 	char *ckalloc();

 	hval = ecode & HASH_SIZE;
 	for (h = hashtab[hval]; h; h = h->link)
 		if (h->ecode == ecode)
 			break;
 	if (h == NULL) {
 		h = (struct hash_node *) ckalloc (sizeof(struct hash_node));
 		h->link = hashtab[hval];
 		hashtab[hval] = h;
 		h->ecode = ecode;
 		h->pos = -1;
 	}
 	next_pos[pos] = h->pos;
 	h->pos = pos;
 }

 /* -----------------------   search the second sequence   --------------------*/

 search()
 {
 	register struct hash_node *h;
 	register char *s;
 	register long ecode, hval;
 	int i;
 	long p;

 	s = seq2 - 1;
 	ecode = 0L;
 	for (i = 1; i < W; ++i)
 		ecode = (ecode << 2) + encoding[*++s];
 	while (*++s) {
 		ecode = ((ecode & mask) << 2) + encoding[*s];
 		hval = ecode & HASH_SIZE;
 		for (h = hashtab[hval]; h; h = h->link)
 			if (h->ecode == ecode) {
 				for (p = h->pos; p >= 0; p = next_pos[p])
 					extend_hit(p, (long)(s+1-seq2));
 				break;
 			}
 	}
 }

 /* extend_hit - extend a word-sized hit to a longer match */
 extend_hit(pos1, pos2)
 long pos1, pos2;
 {
 	char *beg2, *beg1, *end1, *q, *s;
 	long min_sum, max_sum, sum, diag, score;
 	Msp_ptr mp;

 	if (seq1 == seq2 && pos1 >= pos2)
 		return;

 	numhits++;

 	diag = pos2 - pos1;
 	if (diag_lev[diag] > pos1)
 		return;

 	/* extend to the right */
 	max_sum = sum = 0;
 	end1 = q = seq1 + pos1;
 	s = seq2 + pos2;
 	while (*s != '\0' && *q != '\0' && sum >= max_sum - X) {
 		sum += SUBSTITUTION_SCORE(*s++,*q++);
 		if (sum > max_sum) {
 			max_sum = sum;
 			end1 = q;
 		}
 	}

 	/* extend to the left */
 	min_sum = sum = 0;
 	beg1 = q = (seq1 + pos1) - W;
 	beg2 = s = seq2 + pos2 - W;
 	while (s > seq2 && q > seq1 && sum >= min_sum - X) {
 		sum += SUBSTITUTION_SCORE(*--s,*--q);
 		if (sum > min_sum) {
 			min_sum = sum;
 			beg2 = s;
 			beg1 = q;
 		}
 	}

 	score = M*W + max_sum + min_sum;

 	if (score >= K) {
 		++numMSPs;
 		mp = (Msp_ptr)ckalloc(sizeof(Msp));
 		mp->len = end1 - beg1;
 		mp->pos1 = beg1 - seq1;
 		mp->pos2 = beg2 - seq2;
 		mp->score = score;
 		mp->next_msp = msp_list;
 		msp_list = mp;
 	}

 	diag_lev[diag] = (end1 - seq1) + W;
 }

 /* ----------------   compute locally maximal sequence pairs  ---------------*/

 get_msps()
 {
 	long d, i, score, best_score, end1, beg1, lower, upper;

 	for (d = 1-len1; d < len2; ++d) {
 		beg1 = best_score = score = 0;
 		lower = max(1, 1-d);
 		upper = min(len1, len2-d);
 		for (i = lower; i <= upper; ++i)
 			if (score > 0) {
 				score += SUBSTITUTION_SCORE(s1[i], s2[i+d]);
 				if (score > best_score) {
 					best_score = score;
 					end1 = i;
 				}
 			} else {
 				check_score(best_score, d, beg1, end1);
 				best_score = score =
 					SUBSTITUTION_SCORE(s1[i], s2[i+d]);
 				beg1 = end1 = i;
 			}
 		check_score(best_score, d, beg1, end1);
 	}
 }

 check_score(score, d, beg1, end1)
 long score, d, beg1, end1;
 {
 	char *ckalloc();
 	Msp_ptr mp;

 	if (score >= K) {
 		++numMSPs;
 		mp = (Msp_ptr)ckalloc(sizeof(Msp));
 		mp->len = end1 - beg1 + 1;
 		mp->pos1 = beg1 - 1;
 		mp->pos2 = beg1 + d - 1;
 		mp->score = score;
 		mp->next_msp = msp_list;
 		msp_list = mp;
 	}
 }


 /* sort_msps - order database sequence for printing */
 sort_msps()
 {
 	char *ckalloc();
 	int i;
 	Msp_ptr mp;

 	msp = (Msp_ptr *) ckalloc(numMSPs*sizeof(Msp_ptr));
 	for (i = 0, mp = msp_list; i < numMSPs; ++i, mp = mp->next_msp)
 		msp[i] = mp;
 	for (i = (numMSPs/2) - 1; i >= 0; --i)
 		heapify(i, (int) numMSPs-1);
 	for (i = numMSPs-1; i > 0; --i) {
 		mp = msp[0];
 		msp[0] = msp[i];
 		msp[i] = mp;
 		if (i > 1)
 			heapify(0, i-1);
 	}
 }

 /* heapify - core procedure for heapsort */
 heapify(i, last)
 int i, last;
 {
 	int lim = (last-1)/2, left_son, small_son;
 	Msp_ptr mp;

 	while (i <= lim) {
 		left_son = 2*i + 1;
 		if (left_son == last)
 			small_son = left_son;
 		else
 			small_son = smaller(left_son, left_son+1);
 		if (smaller(i, small_son) == small_son) {
 			mp = msp[i];
 			msp[i] = msp[small_son];
 			msp[small_son] = mp;
 			i = small_son;
 		} else
 			break;
 	}
 }

 /* smaller - determine which of two sequences should be printed first */
 int smaller(i, j)
 int i, j;
 {
 	Msp_ptr ki = msp[i], kj = msp[j];

 	if (ki->score == kj->score)
 		return ((ki->pos1 >= kj->pos1) ? i : j);
 	/* print one with larger score first */
 	return ((ki->score <= kj->score) ? i : j);
 }

 /* --------------------------  compute significance parameters ---------------*/

 get_params()
 {
 	double _p[1000], *p , log(), sqrt(), pMatch, pTransit, pTransver, N;
 	long A1, A2, C1, C2, G1, G2, T1, T2;
 	int c, i;

 	A1 = A2 = C1 = C2 = G1 = G2 = T1 = T2 = 0;

 	for (i = 0; i < len1; ++i)
 		if ((c = seq1[i]) == 'A')
 			++A1;
 		else if (c == 'C')
 			++C1;
 		else if (c == 'G')
 			++G1;
 		else if (c == 'T')
 			++T1;
 	for (i = 0; i < len2; ++i)
 		if ((c = seq2[i]) == 'A')
 			++A2;
 		else if (c == 'C')
 			++C2;
 		else if (c == 'G')
 			++G2;
 		else if (c == 'T')
 			++T2;
 	N = ((double)len1)*((double)len2);

 	pMatch = ((double)A1)*((double)A2)
 	       + ((double)C1)*((double)C2)
 	       + ((double)G1)*((double)G2)
 	       + ((double)T1)*((double)T2);
 	pTransit = ((double)A1)*((double)G2)
 	       + ((double)C1)*((double)T2)
 	       + ((double)G1)*((double)A2)
 	       + ((double)T1)*((double)C2);
 	pTransver = N - pMatch - pTransit;

 	for (i = 0; i < V+M; ++i)
 		_p[i] = 0.0;
 	p = _p - V;
 	p[V] = pTransver/N;
 	p[I] += (pTransit/N);
 	p[M] += (pMatch/N);

 	if (karlin(V, M, _p, &param_lambda, &param_K) == 0)
 		fatal("compution of significance levels failed");
 }
 	
 /**************** Statistical Significance Parameter Subroutine ****************

 	Version 1.0	February 2, 1990
 	Version 1.1	July 5,     1990

 	Program by:	Stephen Altschul

 	Address:	National Center for Biotechnology Information
 			National Library of Medicine
 			National Institutes of Health
 			Bethesda, MD  20894

 	Internet:	[email protected]

 See:	Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
 	Significance of Molecular Sequence Features by Using General Scoring
 	Schemes,"  Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.

 	Computes the parameters lambda and K for use in calculating the
 	statistical significance of high-scoring segments or subalignments.

 	The scoring scheme must be integer valued.  A positive score must be
 	possible, but the expected (mean) score must be negative.

 	A program that calls this routine must provide the value of the lowest
 	possible score, the value of the greatest possible score, and a pointer
 	to an array of probabilities for the occurence of all scores between
 	these two extreme scores.  For example, if score -2 occurs with
 	probability 0.7, score 0 occurs with probability 0.1, and score 3
 	occurs with probability 0.2, then the subroutine must be called with
 	low = -2, high = 3, and pr pointing to the array of values
 	{ 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }.  The calling program must also provide
 	pointers to lambda and K; the subroutine will then calculate the values
 	of these two parameters.  In this example, lambda=0.330 and K=0.154.

 	The parameters lambda and K can be used as follows.  Suppose we are
 	given a length N random sequence of independent letters.  Associated
 	with each letter is a score, and the probabilities of the letters
 	determine the probability for each score.  Let S be the aggregate score
 	of the highest scoring contiguous segment of this sequence.  Then if N
 	is sufficiently large (greater than 100), the following bound on the
 	probability that S is greater than or equal to x applies:
 	
 		P( S >= x )   <=   1 - exp [ - KN exp ( - lambda * x ) ].
 	
 	In other words, the p-value for this segment can be written as
 	1-exp[-KN*exp(-lambda*S)].

 	This formula can be applied to pairwise sequence comparison by assigning
 	scores to pairs of letters (e.g. amino acids), and by replacing N in the
 	formula with N*M, where N and M are the lengths of the two sequences
 	being compared.

 	In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
 	distinct segments all with score >= S is given by:

                               2             m-1           -y
 		1 - [ 1 + y + y /2! + ... + y   /(m-1)! ] e

 	Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
 	as the previous formula.

 *******************************************************************************/

 #define MAXIT 150	/* Maximum number of iterations used in calculating K */

 karlin(low,high,pr,lambda,K)
 	int low;			/* Lowest score (must be negative)    */
 	int high;			/* Highest score (must be positive)   */
 	double *pr;			/* Probabilities for various scores   */
 	double *lambda;			/* Pointer to parameter lambda        */
 	double *K;			/* Pointer to parmeter K              */
 {
 	int i,j,range,lo,hi,first,last;
 	double up,new,Sum,av;
 	register double sum;
 	double *p,*P,*ptrP,exp();
 	register double *ptr1,*ptr2,*ptr1e;
 	char *calloc();

 	/* Check that scores and their associated probabilities are valid     */

 	if (low>=0) {
 		fprintf(stderr,"Lowest score must be negative.\n");
 		return 0;
 	}
 	for (i=range=high-low;i> -low && !pr[i];--i);
 	if (i<= -low) {
 		fprintf(stderr,"A positive score must be possible.\n");
 		return 0;
 	}
 	for (sum=i=0;i<=range;sum+=pr[i++]) if (pr[i]<0) {
 		fprintf(stderr,"Negative probabilities not allowed.\n");
 		return 0;
 	}
 	if (sum<0.99995 || sum>1.00005) fprintf(stderr,"Probabilities sum to %.4f.  Normalizing.\n",sum);
 	p= (double *) calloc(range+1,sizeof(double));
 	for (Sum=low,i=0;i<=range;++i) Sum+=i*(p[i]=pr[i]/sum);
 	if (Sum>=0) {
 		fprintf(stderr,"Invalid (non-negative) expected score:  %.3f\n",Sum);
 		free(p);
 		return 0;
 	}

 	/* Calculate the parameter lambda */

 	up=0.5;
 	do {
 		up*=2;
 		ptr1=p;
 		for (sum=0,i=low;i<=high;++i) sum+= *ptr1++ * exp(up*i);
 	}
 	while (sum<1.0);
 	for (*lambda=j=0;j<25;++j) {
 		new=(*lambda+up)/2.0;
 		ptr1=p;
 		for (sum=0,i=low;i<=high;++i) sum+= *ptr1++ * exp(new*i);
 		if (sum>1.0) up=new;
 		else *lambda=new;
 	}

 	/* Calculate the pamameter K */

 	ptr1=p;
 	for (av=0,i=low;i<=high;++i) av+= *ptr1++ *i*exp(*lambda*i);
 	if (low == -1 || high == 1) {
 		*K = high==1 ? av : Sum*Sum/av;
 		*K *= 1.0 - exp(- *lambda);
 		free(p);
 		return 1;	/* Parameters calculated successfully */
 	}
 	Sum=lo=hi=0;
 	P= (double *) calloc(MAXIT*range+1,sizeof(double));
 	for (*P=sum=j=1;j<=MAXIT && sum>0.00001;Sum+=sum/=j++) {
 		first=last=range;
 		for (ptrP=P+(hi+=high)-(lo+=low);ptrP>=P;*ptrP-- =sum) {
 			ptr1=ptrP-first;
 			ptr1e=ptrP-last;
 			ptr2=p+first;
 			for (sum=0;ptr1>=ptr1e;) sum+= *ptr1-- * *ptr2++;
 			if (first) --first;
 			if (ptrP-P<=range) --last;
 		}
 		for (sum=0,i=lo;i;++i) sum+= *++ptrP * exp(*lambda*i);
 		for (;i<=hi;++i) sum+= *++ptrP;
 	}
 	if (j>MAXIT) fprintf(stderr,"Value for K may be too large due to insufficient iterations.\n");
 	for (i=low;!p[i-low];++i);
 	for (j= -i;i<high && j>1;) if (p[++i-low]) j=gcd(j,i);
 	*K = (j*exp(-2*Sum))/(av*(1.0-exp(- *lambda*j)));

 	free(p);
 	free(P);
 	return 1;		/* Parameters calculated successfully */
 }

 gcd(a,b)
 	int a,b;
 {
 	int c;

 	if (b<0) b= -b;
 	if (b>a) { c=a; a=b; b=c; }
 	for (;b;b=c) { c=a%b; a=b; }
 	return a;
 }
 /* ---------------------------------------------------------------------------*/

 /* print_msp - display the i-th MSP */
 print_msp(i)
 int i;
 {
 	long len, pos1, pos2;
 	Msp_ptr mp = msp[i];
 	int j, start_j, W, X, sc;
 	double prob, significance();
 	
 	
 	pos1 = mp->pos1;
 	pos2 = mp->pos2;
 	len = mp->len;
 	prob = significance(mp->score);
 	if (!alignments || print_stats) {
 		if (prob > .99)
 			++sig99;
 		if (prob > .90)
 			++sig90;
 		if (prob > .50)
 			++sig50;

 		for (W = start_j = j = 0; j < len; ++j)
 			if (seq1[pos1+j] != seq2[pos2+j]) {
 				W = max(W, j-start_j);
 				start_j = j+1;
 			}
 		W = max(W, len-start_j);

 		for (j = 4; j <= 12; ++j) {
 			if (prob > .99 && W < j)
 				++missW[j][0];
 			if (prob > .90 && W < j)
 				++missW[j][1];
 			if (prob > .50 && W < j)
 				++missW[j][2];
 		}

 		for (sc = X = j = 0; j < len; ++j)
 			if (sc < 0) {
 				sc += SUBSTITUTION_SCORE(seq1[pos1+j], seq2[pos2+j]);
 				if (sc < X)
 					X = sc;
 			} else
 				sc = SUBSTITUTION_SCORE(seq1[pos1+j], seq2[pos2+j]);
 		X = -X;
 		for (j = 1; j <= 20; ++j) {
 			if (prob > .99 && X > j)
 				++missX[j][0];
 			if (prob > .90 && X > j)
 				++missX[j][1];
 			if (prob > .50 && X > j)
 				++missX[j][2];
 		}

 	}
 	if (!alignments) {
 		printf("%4d: %6d %7d %5d %6d  %5.1lf%% %4d %4d\n",
 			i+1, pos1+1, pos2+1, len, mp->score, 100*prob, W, X);
 		return;
 	}
 	printf("a {\n  s %d\n  b %d %d\n  e %d %d\n  l %d %d %d %d %1.1lf\n}\n",
 	  mp->score, pos1+1, pos2+1, pos1+len, pos2+len,
 	  pos1+1, pos2+1, pos1+len, pos2+len, 100*prob);
 }

 double significance(score)
 int score;
 {
 	double y, exp(), N;
 	
 	N = ((double)len1)*((double)len2);
 	y = -param_lambda*(score);
 	y = param_K*N*exp(y);
 	return exp(-y);
 }

 /* ----------------------------   utility routines   -------------------------*/

 /* fatal - print message and die */
 fatal(msg)
 char *msg;
 {
 	fprintf(stderr, "%s\n", msg);
 	exit(1);
 }

 /* fatalf - format message, print it, and die */
 fatalf(msg, val)
 char *msg, *val;
 {
 	fprintf(stderr, msg, val);
 	putc('\n', stderr);
 	exit(1);
 }
 	
 /* ckopen - open file; check for success */
 FILE *ckopen(name, mode)
 char *name, *mode;
 {
 	FILE *fopen(), *fp;

 	if ((fp = fopen(name, mode)) == NULL)
 		fatalf("Cannot open %s.", name);
 	return(fp);
 }

 /* ckalloc - allocate space; check for success */
 char *ckalloc(amount)
 int amount;
 {
 	char *malloc(), *p;

 	if ((p = malloc( (unsigned) amount)) == NULL)
 		fatal("Ran out of memory.");
 	return(p);
 }
	/*
	* BLAST - Search two DNA sequences for locally maximal segment pairs. The basic
	* command syntax is
	*
	* BLAST sequence1 sequence2
	*
	* where sequence1 and sequence2 name files containing DNA sequences. Lines
	* at the beginnings of the files that don't start with 'A', 'C', 'T' or 'G'
	* are discarded. Thus a typical sequence file might begin:
	*
	* >BOVHBPP3I Bovine beta-globin psi-3 pseudogene, 5' end.
	* GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT
	* GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC
	*
	* If sequence1 and sequence2 are identical file names, then matches are
	* computed without "mirror image" pairs and without the trivial long match.
	*
	* The BLAST command permits optional additional arguments, such as X=50,
	* that reset certain internal parameters. The available parameters are:
	*
	* M or m gives the score for a match.
	* I or i gives the score for a transition (A <--> G, C <--> T).
	* V or v gives the score for a tranversion (non-transition change).
	* W or w gives the word size.
	* X or x gives the value for terminating word extensions.
	* K or k give the cutoff.
	* (defaults: M = 1, I = -1, V = -1, W = 8, X = 10*M,
	* and K = 5% significance level)
	*
	* If W is set to 0, then all locally maximum segment pairs (LMSPs) are
	* computed; in this case the value of X is irrelevant.
	*
	* In addition, the word "noalign" requests that no alignments be printed
	* (summary statistics for each locally maximal segment pair are reported)
	* and the word "stats" requests that statistics concerning the performance
	* of BLAST be printed. Thus a typical command is
	*
	* BLAST SEQ1 SEQ2 M=1 I=0 V=-1 noalign
	*/
	#include <stdio.h>
	#include <math.h>
	#include <ctype.h>

	#define max(x,y) ((x > y) ? (x) : (y))
	#define min(x,y) ((x < y) ? (x) : (y))
	#define SUBSTITUTION_SCORE(x,y) S[x][y]

	#define DEFAULT_M 1 /* default score for match */
	#define DEFAULT_I -1 /* default score for transition */
	#define DEFAULT_V -1 /* default score for transversion */
	#define DEFAULT_W 8 /* default word size */
	#define DEFAULT_SIG 0.05 /* default significance level for setting K */

	char
	S[128][128],
	s1, seq1, s2, seq2;
	int
	alignments,
	print_stats,
	K, /* cutoff score */
	M, /* score for a match */
	I, /* score for a transition */
	V, /* score for a transversion */
	W, /* word length */
	X, /* cutoff for hit extensions */
	sig99, sig90, sig50, /* number of MPSs above given significance */
	numMSPs, /* total number of MSPs recorded */
	numhits, /* number of word hits */
	missW[15][3],
	missX[25][3];

	double
	param_K, param_lambda;

	long
	diag_lev, / extent of discovered matches on a given diagonal */
	len1, len2; /* sequence lengths */


	typedef struct msp {
	long len, pos1, pos2;
	int score;
	struct msp *next_msp;
	} Msp, *Msp_ptr;

	Msp_ptr msp_list, *msp;

	main(argc, argv)
	int argc;
	char *argv[];
	{
	int i, v;
	double x, log(), sqrt(), significance();
	char *ckalloc();

	if (argc < 3)
	fatalf("%s seq1 seq2 [M=] [I=] [V=] [K=] [W=] [X=] [noalign] [stats]",
	argv[0]);
	M = DEFAULT_M;
	I = DEFAULT_I;
	V = DEFAULT_V;
	W = DEFAULT_W;
	X = -1;
	alignments = 1;
	print_stats = 0;
	while (--argc > 2) {
	if (strcmp(argv[argc],"noalign") == 0) {
	alignments = 0;
	continue;
	}
	if (strcmp(argv[argc],"stats") == 0) {
	print_stats = 1;
	continue;
	}
	if (argv[argc][1] != '=')
	fatalf("argument %d has improper form", argc);
	v = atoi(argv[argc] + 2);
	switch (argv[argc][0]) {
	case 'M':
	case 'm':
	M = v;
	break;
	case 'I':
	case 'i':
	I = v;
	break;
	case 'V':
	case 'v':
	V = v;
	break;
	case 'K':
	case 'k':
	K = v;
	break;
	case 'W':
	case 'w':
	W = v;
	break;
	case 'X':
	case 'x':
	X = v;
	break;
	default:
	fatal("Options are M, I, V, K, W, X.");
	}
	}

	if (X == -1)
	X = 10*M;
	if (V > I)
	fatal("transversions score higher than transitions?");

	len1 = get_seq(argv[1], &seq1);
	if (strcmp(argv[1],argv[2]) == 0) {
	if (W == 0)
	fatal("file1 = file2 not implemented when W=0");
	seq2 = seq1;
	len2 = len1;
	} else
	len2 = get_seq(argv[2], &seq2);
	s1 = seq1 - 1; /* so subscripts start with 1 */
	s2 = seq2 - 1;
	diag_lev = (long )ckalloc((len1+len2+1)sizeof(long)) + len1;

	/* set up scoring matrix */
	S['A']['A'] = S['C']['C'] = S['G']['G'] = S['T']['T'] = M;
	S['A']['G'] = S['G']['A'] = S['C']['T'] = S['T']['C'] = I;
	S['A']['C'] = S['A']['T'] = S['C']['A'] = S['C']['G'] =
	S['G']['C'] = S['G']['T'] = S['T']['A'] = S['T']['G'] = V;

	get_params();
	if (K == 0) {
	x = log(DEFAULT_SIG)/(-param_Klen1len2);
	K = 2*log(sqrt(x))/(-param_lambda);
	while (significance(K-1) >= DEFAULT_SIG)
	--K;
	while (significance(K) < DEFAULT_SIG)
	++K;
	}

	if (alignments) {
	printf("#:lav\n\n");
	printf("d {\n\"BLAST output with parameters:\n");
	printf("M = %d, I = %d, V = %d\n", M, I, V);
	if (W > 0)
	printf("W = %d, X = %d\"", W, X);
	else
	printf("W = 0 (computes all LMSPs)\"");
	printf("\n}\n");
	printf("s {\n \"%s\" 1 %d\n \"%s\" 1 %d\n}\n",
	argv[1], len1, argv[2], len2);
	printf("k {\n \"significance\"\n}\n");
	}

	if (W > 0) {
	bld_table();
	search();
	} else
	get_msps();

	sort_msps();

	if (!alignments)
	printf("\n\n Seq. 1 Seq. 2 len score signif W X\n");
	/* print the matches for each sequence */
	for (i = 0; i < numMSPs; ++i)
	print_msp(i);

	if (W == 0 && print_stats) {
	printf("\n%3d MSPs above significance level 0.99\n", sig99);
	printf("%3d MSPs above significance level 0.90\n", sig90);
	printf("%3d MSPs above significance level 0.50\n", sig50);
	printf("\nPercent of MSPs missed by BLAST at various");
	printf(" W and significance levels:\n");
	printf(" W: 0.99 0.90 0.50\n");
	for (i = 4; i <= 12; ++i)
	printf("%2d: %4.1f%% %4.1f%% %4.1f%%\n",
	i,
	100*missW[i][0]/((float)sig99),
	100*missW[i][1]/((float)sig90),
	100*missW[i][2]/((float)sig50));
	printf("\nPercent of MSPs missed by BLAST at various");
	printf(" X and significance levels:\n");
	printf(" X: 0.99 0.90 0.50\n");
	for (i = 1; i <= 20; ++i)
	printf("%2d: %4.1f%% %4.1f%% %4.1f%%\n",
	i,
	100*missX[i][0]/((float)sig99),
	100*missX[i][1]/((float)sig90),
	100*missX[i][2]/((float)sig50));
	}
	if (W > 0 && print_stats) {
	printf("\n\nStatistics:\n");
	printf(" %s: M = %d, I = %d, V = %d, K = %d, W = %d, X = %d\n",
	argv[0], M, I, V, K, W, X);
	printf(" # of word hits = %d\n", numhits);
	printf(" # of matches found = %d\n", numMSPs);
	}

	if (strcmp(argv[1], argv[2]) == 0) {
	/* self-comparison; insert trivial diagonal */
	printf("a {\n s 0.0\n b 1 1\n e %d %d\n", len1, len1);
	printf(" l 1 1 %d %d 0.0\n}\n", len1, len1);
	}

	exit (0);
	}

	/* get_seq - read a sequence */
	int get_seq(file_name, seqptr)
	char file_name, *seqptr;
	{
	FILE qp, ckopen();
	char p, fgets(), strchr(), ckalloc();
	int c;
	long n;

	qp = ckopen(file_name, "r");
	for (n = 0; (c = getc(qp)) != EOF; )
	if (c != '\n')
	++n;
	*seqptr = ckalloc(n+1);
	rewind(qp);
	if (n == 0)
	return 0;
	for (p = *seqptr; ; ) {
	if (fgets(p, (int)n+1, qp) == NULL) {
	fclose(qp);
	*p = '\0';
	break;
	}
	if (p == seqptr && strchr("ACTG", p) == NULL)
	continue;
	while (isupper(*p))
	++p;
	if (p != '\n' && p != '\0')
	fatalf("Illegal character %c in query sequence.", *p);
	}
	return p - *seqptr;
	}

	/* ------------- build table of W-tuples in the first sequence ------------*/

	/* The data structure could be simplified if we limited W to, say, W <= 8. */

	struct hash_node {
	long ecode; /* integer encoding of the word */
	int pos; /* positions where word hits query sequence */
	struct hash_node link; / next word with same last 7.5 letters */
	};

	#define HASH_SIZE 32767 /* 2*15 - 1 /

	struct hash_node *hashtab[HASH_SIZE+1];
	int encoding[128];
	int mask;
	long *next_pos;

	bld_table()
	{
	long ecode;
	int i;
	char q, ckalloc();

	/* perform initializations */
	encoding['C'] = 1;
	encoding['G'] = 2;
	encoding['T'] = 3;
	mask = (1 << (W+W-2)) - 1;
	next_pos = (long )ckalloc(len1sizeof(long));

	q = seq1 - 1;
	ecode = 0L;
	for (i = 1; i < W; ++i)
	ecode = (ecode << 2) + encoding[*++q];
	while (*++q) {
	ecode = ((ecode & mask) << 2) + encoding[*q];
	add_word(ecode, (long)(q - seq1 +1));
	}
	}

	/* add_word - add a word to the table of critical words */
	add_word(ecode, pos)
	long ecode, pos;
	{
	struct hash_node *h;
	long hval;
	char *ckalloc();

	hval = ecode & HASH_SIZE;
	for (h = hashtab[hval]; h; h = h->link)
	if (h->ecode == ecode)
	break;
	if (h == NULL) {
	h = (struct hash_node *) ckalloc (sizeof(struct hash_node));
	h->link = hashtab[hval];
	hashtab[hval] = h;
	h->ecode = ecode;
	h->pos = -1;
	}
	next_pos[pos] = h->pos;
	h->pos = pos;
	}

	/* ----------------------- search the second sequence --------------------*/

	search()
	{
	register struct hash_node *h;
	register char *s;
	register long ecode, hval;
	int i;
	long p;

	s = seq2 - 1;
	ecode = 0L;
	for (i = 1; i < W; ++i)
	ecode = (ecode << 2) + encoding[*++s];
	while (*++s) {
	ecode = ((ecode & mask) << 2) + encoding[*s];
	hval = ecode & HASH_SIZE;
	for (h = hashtab[hval]; h; h = h->link)
	if (h->ecode == ecode) {
	for (p = h->pos; p >= 0; p = next_pos[p])
	extend_hit(p, (long)(s+1-seq2));
	break;
	}
	}
	}

	/* extend_hit - extend a word-sized hit to a longer match */
	extend_hit(pos1, pos2)
	long pos1, pos2;
	{
	char beg2, beg1, end1, q, *s;
	long min_sum, max_sum, sum, diag, score;
	Msp_ptr mp;

	if (seq1 == seq2 && pos1 >= pos2)
	return;

	numhits++;

	diag = pos2 - pos1;
	if (diag_lev[diag] > pos1)
	return;

	/* extend to the right */
	max_sum = sum = 0;
	end1 = q = seq1 + pos1;
	s = seq2 + pos2;
	while (s != '\0' && q != '\0' && sum >= max_sum - X) {
	sum += SUBSTITUTION_SCORE(s++,q++);
	if (sum > max_sum) {
	max_sum = sum;
	end1 = q;
	}
	}

	/* extend to the left */
	min_sum = sum = 0;
	beg1 = q = (seq1 + pos1) - W;
	beg2 = s = seq2 + pos2 - W;
	while (s > seq2 && q > seq1 && sum >= min_sum - X) {
	sum += SUBSTITUTION_SCORE(--s,--q);
	if (sum > min_sum) {
	min_sum = sum;
	beg2 = s;
	beg1 = q;
	}
	}

	score = M*W + max_sum + min_sum;

	if (score >= K) {
	++numMSPs;
	mp = (Msp_ptr)ckalloc(sizeof(Msp));
	mp->len = end1 - beg1;
	mp->pos1 = beg1 - seq1;
	mp->pos2 = beg2 - seq2;
	mp->score = score;
	mp->next_msp = msp_list;
	msp_list = mp;
	}

	diag_lev[diag] = (end1 - seq1) + W;
	}

	/* ---------------- compute locally maximal sequence pairs ---------------*/

	get_msps()
	{
	long d, i, score, best_score, end1, beg1, lower, upper;

	for (d = 1-len1; d < len2; ++d) {
	beg1 = best_score = score = 0;
	lower = max(1, 1-d);
	upper = min(len1, len2-d);
	for (i = lower; i <= upper; ++i)
	if (score > 0) {
	score += SUBSTITUTION_SCORE(s1[i], s2[i+d]);
	if (score > best_score) {
	best_score = score;
	end1 = i;
	}
	} else {
	check_score(best_score, d, beg1, end1);
	best_score = score =
	SUBSTITUTION_SCORE(s1[i], s2[i+d]);
	beg1 = end1 = i;
	}
	check_score(best_score, d, beg1, end1);
	}
	}

	check_score(score, d, beg1, end1)
	long score, d, beg1, end1;
	{
	char *ckalloc();
	Msp_ptr mp;

	if (score >= K) {
	++numMSPs;
	mp = (Msp_ptr)ckalloc(sizeof(Msp));
	mp->len = end1 - beg1 + 1;
	mp->pos1 = beg1 - 1;
	mp->pos2 = beg1 + d - 1;
	mp->score = score;
	mp->next_msp = msp_list;
	msp_list = mp;
	}
	}


	/* sort_msps - order database sequence for printing */
	sort_msps()
	{
	char *ckalloc();
	int i;
	Msp_ptr mp;

	msp = (Msp_ptr ) ckalloc(numMSPssizeof(Msp_ptr));
	for (i = 0, mp = msp_list; i < numMSPs; ++i, mp = mp->next_msp)
	msp[i] = mp;
	for (i = (numMSPs/2) - 1; i >= 0; --i)
	heapify(i, (int) numMSPs-1);
	for (i = numMSPs-1; i > 0; --i) {
	mp = msp[0];
	msp[0] = msp[i];
	msp[i] = mp;
	if (i > 1)
	heapify(0, i-1);
	}
	}

	/* heapify - core procedure for heapsort */
	heapify(i, last)
	int i, last;
	{
	int lim = (last-1)/2, left_son, small_son;
	Msp_ptr mp;

	while (i <= lim) {
	left_son = 2*i + 1;
	if (left_son == last)
	small_son = left_son;
	else
	small_son = smaller(left_son, left_son+1);
	if (smaller(i, small_son) == small_son) {
	mp = msp[i];
	msp[i] = msp[small_son];
	msp[small_son] = mp;
	i = small_son;
	} else
	break;
	}
	}

	/* smaller - determine which of two sequences should be printed first */
	int smaller(i, j)
	int i, j;
	{
	Msp_ptr ki = msp[i], kj = msp[j];

	if (ki->score == kj->score)
	return ((ki->pos1 >= kj->pos1) ? i : j);
	/* print one with larger score first */
	return ((ki->score <= kj->score) ? i : j);
	}

	/* -------------------------- compute significance parameters ---------------*/

	get_params()
	{
	double _p[1000], *p , log(), sqrt(), pMatch, pTransit, pTransver, N;
	long A1, A2, C1, C2, G1, G2, T1, T2;
	int c, i;

	A1 = A2 = C1 = C2 = G1 = G2 = T1 = T2 = 0;

	for (i = 0; i < len1; ++i)
	if ((c = seq1[i]) == 'A')
	++A1;
	else if (c == 'C')
	++C1;
	else if (c == 'G')
	++G1;
	else if (c == 'T')
	++T1;
	for (i = 0; i < len2; ++i)
	if ((c = seq2[i]) == 'A')
	++A2;
	else if (c == 'C')
	++C2;
	else if (c == 'G')
	++G2;
	else if (c == 'T')
	++T2;
	N = ((double)len1)*((double)len2);

	pMatch = ((double)A1)*((double)A2)
	+ ((double)C1)*((double)C2)
	+ ((double)G1)*((double)G2)
	+ ((double)T1)*((double)T2);
	pTransit = ((double)A1)*((double)G2)
	+ ((double)C1)*((double)T2)
	+ ((double)G1)*((double)A2)
	+ ((double)T1)*((double)C2);
	pTransver = N - pMatch - pTransit;

	for (i = 0; i < V+M; ++i)
	_p[i] = 0.0;
	p = _p - V;
	p[V] = pTransver/N;
	p[I] += (pTransit/N);
	p[M] += (pMatch/N);

	if (karlin(V, M, _p, &param_lambda, &param_K) == 0)
	fatal("compution of significance levels failed");
	}

	/************** Statistical Significance Parameter Subroutine **************

	Version 1.0 February 2, 1990
	Version 1.1 July 5, 1990

	Program by: Stephen Altschul

	Address: National Center for Biotechnology Information
	National Library of Medicine
	National Institutes of Health
	Bethesda, MD 20894

	Internet: [email protected]

	See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
	Significance of Molecular Sequence Features by Using General Scoring
	Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.

	Computes the parameters lambda and K for use in calculating the
	statistical significance of high-scoring segments or subalignments.

	The scoring scheme must be integer valued. A positive score must be
	possible, but the expected (mean) score must be negative.

	A program that calls this routine must provide the value of the lowest
	possible score, the value of the greatest possible score, and a pointer
	to an array of probabilities for the occurence of all scores between
	these two extreme scores. For example, if score -2 occurs with
	probability 0.7, score 0 occurs with probability 0.1, and score 3
	occurs with probability 0.2, then the subroutine must be called with
	low = -2, high = 3, and pr pointing to the array of values
	{ 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide
	pointers to lambda and K; the subroutine will then calculate the values
	of these two parameters. In this example, lambda=0.330 and K=0.154.

	The parameters lambda and K can be used as follows. Suppose we are
	given a length N random sequence of independent letters. Associated
	with each letter is a score, and the probabilities of the letters
	determine the probability for each score. Let S be the aggregate score
	of the highest scoring contiguous segment of this sequence. Then if N
	is sufficiently large (greater than 100), the following bound on the
	probability that S is greater than or equal to x applies:

	P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].

	In other words, the p-value for this segment can be written as
	1-exp[-KNexp(-lambdaS)].

	This formula can be applied to pairwise sequence comparison by assigning
	scores to pairs of letters (e.g. amino acids), and by replacing N in the
	formula with N*M, where N and M are the lengths of the two sequences
	being compared.

	In addition, letting y = KNexp(-lambdaS), the p-value for finding m
	distinct segments all with score >= S is given by:

	2 m-1 -y
	1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e

	Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
	as the previous formula.

	*******************************************************************************/

	#define MAXIT 150 /* Maximum number of iterations used in calculating K */

	karlin(low,high,pr,lambda,K)
	int low; /* Lowest score (must be negative) */
	int high; /* Highest score (must be positive) */
	double pr; / Probabilities for various scores */
	double lambda; / Pointer to parameter lambda */
	double K; / Pointer to parmeter K */
	{
	int i,j,range,lo,hi,first,last;
	double up,new,Sum,av;
	register double sum;
	double p,P,*ptrP,exp();
	register double ptr1,ptr2,*ptr1e;
	char *calloc();

	/* Check that scores and their associated probabilities are valid */

	if (low>=0) {
	fprintf(stderr,"Lowest score must be negative.\n");
	return 0;
	}
	for (i=range=high-low;i> -low && !pr[i];--i);
	if (i<= -low) {
	fprintf(stderr,"A positive score must be possible.\n");
	return 0;
	}
	for (sum=i=0;i<=range;sum+=pr[i++]) if (pr[i]<0) {
	fprintf(stderr,"Negative probabilities not allowed.\n");
	return 0;
	}
	if (sum<0.99995 \|\| sum>1.00005) fprintf(stderr,"Probabilities sum to %.4f. Normalizing.\n",sum);
	p= (double *) calloc(range+1,sizeof(double));
	for (Sum=low,i=0;i<=range;++i) Sum+=i*(p[i]=pr[i]/sum);
	if (Sum>=0) {
	fprintf(stderr,"Invalid (non-negative) expected score: %.3f\n",Sum);
	free(p);
	return 0;
	}

	/* Calculate the parameter lambda */

	up=0.5;
	do {
	up*=2;
	ptr1=p;
	for (sum=0,i=low;i<=high;++i) sum+= ptr1++ exp(up*i);
	}
	while (sum<1.0);
	for (*lambda=j=0;j<25;++j) {
	new=(*lambda+up)/2.0;
	ptr1=p;
	for (sum=0,i=low;i<=high;++i) sum+= ptr1++ exp(new*i);
	if (sum>1.0) up=new;
	else *lambda=new;
	}

	/* Calculate the pamameter K */

	ptr1=p;
	for (av=0,i=low;i<=high;++i) av+= ptr1++ iexp(lambda*i);
	if (low == -1 \|\| high == 1) {
	K = high==1 ? av : SumSum/av;
	K = 1.0 - exp(- *lambda);
	free(p);
	return 1; /* Parameters calculated successfully */
	}
	Sum=lo=hi=0;
	P= (double ) calloc(MAXITrange+1,sizeof(double));
	for (*P=sum=j=1;j<=MAXIT && sum>0.00001;Sum+=sum/=j++) {
	first=last=range;
	for (ptrP=P+(hi+=high)-(lo+=low);ptrP>=P;*ptrP-- =sum) {
	ptr1=ptrP-first;
	ptr1e=ptrP-last;
	ptr2=p+first;
	for (sum=0;ptr1>=ptr1e;) sum+= ptr1-- *ptr2++;
	if (first) --first;
	if (ptrP-P<=range) --last;
	}
	for (sum=0,i=lo;i;++i) sum+= ++ptrP exp(lambdai);
	for (;i<=hi;++i) sum+= *++ptrP;
	}
	if (j>MAXIT) fprintf(stderr,"Value for K may be too large due to insufficient iterations.\n");
	for (i=low;!p[i-low];++i);
	for (j= -i;i<high && j>1;) if (p[++i-low]) j=gcd(j,i);
	K = (jexp(-2Sum))/(av(1.0-exp(- lambdaj)));

	free(p);
	free(P);
	return 1; /* Parameters calculated successfully */
	}

	gcd(a,b)
	int a,b;
	{
	int c;

	if (b<0) b= -b;
	if (b>a) { c=a; a=b; b=c; }
	for (;b;b=c) { c=a%b; a=b; }
	return a;
	}
	/* ---------------------------------------------------------------------------*/

	/* print_msp - display the i-th MSP */
	print_msp(i)
	int i;
	{
	long len, pos1, pos2;
	Msp_ptr mp = msp[i];
	int j, start_j, W, X, sc;
	double prob, significance();


	pos1 = mp->pos1;
	pos2 = mp->pos2;
	len = mp->len;
	prob = significance(mp->score);
	if (!alignments \|\| print_stats) {
	if (prob > .99)
	++sig99;
	if (prob > .90)
	++sig90;
	if (prob > .50)
	++sig50;

	for (W = start_j = j = 0; j < len; ++j)
	if (seq1[pos1+j] != seq2[pos2+j]) {
	W = max(W, j-start_j);
	start_j = j+1;
	}
	W = max(W, len-start_j);

	for (j = 4; j <= 12; ++j) {
	if (prob > .99 && W < j)
	++missW[j][0];
	if (prob > .90 && W < j)
	++missW[j][1];
	if (prob > .50 && W < j)
	++missW[j][2];
	}

	for (sc = X = j = 0; j < len; ++j)
	if (sc < 0) {
	sc += SUBSTITUTION_SCORE(seq1[pos1+j], seq2[pos2+j]);
	if (sc < X)
	X = sc;
	} else
	sc = SUBSTITUTION_SCORE(seq1[pos1+j], seq2[pos2+j]);
	X = -X;
	for (j = 1; j <= 20; ++j) {
	if (prob > .99 && X > j)
	++missX[j][0];
	if (prob > .90 && X > j)
	++missX[j][1];
	if (prob > .50 && X > j)
	++missX[j][2];
	}

	}
	if (!alignments) {
	printf("%4d: %6d %7d %5d %6d %5.1lf%% %4d %4d\n",
	i+1, pos1+1, pos2+1, len, mp->score, 100*prob, W, X);
	return;
	}
	printf("a {\n s %d\n b %d %d\n e %d %d\n l %d %d %d %d %1.1lf\n}\n",
	mp->score, pos1+1, pos2+1, pos1+len, pos2+len,
	pos1+1, pos2+1, pos1+len, pos2+len, 100*prob);
	}

	double significance(score)
	int score;
	{
	double y, exp(), N;

	N = ((double)len1)*((double)len2);
	y = -param_lambda*(score);
	y = param_KNexp(y);
	return exp(-y);
	}

	/* ---------------------------- utility routines -------------------------*/

	/* fatal - print message and die */
	fatal(msg)
	char *msg;
	{
	fprintf(stderr, "%s\n", msg);
	exit(1);
	}

	/* fatalf - format message, print it, and die */
	fatalf(msg, val)
	char msg, val;
	{
	fprintf(stderr, msg, val);
	putc('\n', stderr);
	exit(1);
	}

	/* ckopen - open file; check for success */
	FILE *ckopen(name, mode)
	char name, mode;
	{
	FILE fopen(), fp;

	if ((fp = fopen(name, mode)) == NULL)
	fatalf("Cannot open %s.", name);
	return(fp);
	}

	/* ckalloc - allocate space; check for success */
	char *ckalloc(amount)
	int amount;
	{
	char malloc(), p;

	if ((p = malloc( (unsigned) amount)) == NULL)
	fatal("Ran out of memory.");
	return(p);
	}