Last active
October 14, 2020 17:29
-
-
Save jxtx/b1c170bc5a35487c35fb to your computer and use it in GitHub Desktop.
Oldest nucleotide blast.c I can find...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* BLAST - Search two DNA sequences for locally maximal segment pairs. The basic | |
* command syntax is | |
* | |
* BLAST sequence1 sequence2 | |
* | |
* where sequence1 and sequence2 name files containing DNA sequences. Lines | |
* at the beginnings of the files that don't start with 'A', 'C', 'T' or 'G' | |
* are discarded. Thus a typical sequence file might begin: | |
* | |
* >BOVHBPP3I Bovine beta-globin psi-3 pseudogene, 5' end. | |
* GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT | |
* GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC | |
* | |
* If sequence1 and sequence2 are identical file names, then matches are | |
* computed without "mirror image" pairs and without the trivial long match. | |
* | |
* The BLAST command permits optional additional arguments, such as X=50, | |
* that reset certain internal parameters. The available parameters are: | |
* | |
* M or m gives the score for a match. | |
* I or i gives the score for a transition (A <--> G, C <--> T). | |
* V or v gives the score for a tranversion (non-transition change). | |
* W or w gives the word size. | |
* X or x gives the value for terminating word extensions. | |
* K or k give the cutoff. | |
* (defaults: M = 1, I = -1, V = -1, W = 8, X = 10*M, | |
* and K = 5% significance level) | |
* | |
* If W is set to 0, then all locally maximum segment pairs (LMSPs) are | |
* computed; in this case the value of X is irrelevant. | |
* | |
* In addition, the word "noalign" requests that no alignments be printed | |
* (summary statistics for each locally maximal segment pair are reported) | |
* and the word "stats" requests that statistics concerning the performance | |
* of BLAST be printed. Thus a typical command is | |
* | |
* BLAST SEQ1 SEQ2 M=1 I=0 V=-1 noalign | |
*/ | |
#include <stdio.h> | |
#include <math.h> | |
#include <ctype.h> | |
#define max(x,y) ((x > y) ? (x) : (y)) | |
#define min(x,y) ((x < y) ? (x) : (y)) | |
#define SUBSTITUTION_SCORE(x,y) S[x][y] | |
#define DEFAULT_M 1 /* default score for match */ | |
#define DEFAULT_I -1 /* default score for transition */ | |
#define DEFAULT_V -1 /* default score for transversion */ | |
#define DEFAULT_W 8 /* default word size */ | |
#define DEFAULT_SIG 0.05 /* default significance level for setting K */ | |
char | |
S[128][128], | |
*s1, *seq1, *s2, *seq2; | |
int | |
alignments, | |
print_stats, | |
K, /* cutoff score */ | |
M, /* score for a match */ | |
I, /* score for a transition */ | |
V, /* score for a transversion */ | |
W, /* word length */ | |
X, /* cutoff for hit extensions */ | |
sig99, sig90, sig50, /* number of MPSs above given significance */ | |
numMSPs, /* total number of MSPs recorded */ | |
numhits, /* number of word hits */ | |
missW[15][3], | |
missX[25][3]; | |
double | |
param_K, param_lambda; | |
long | |
*diag_lev, /* extent of discovered matches on a given diagonal */ | |
len1, len2; /* sequence lengths */ | |
typedef struct msp { | |
long len, pos1, pos2; | |
int score; | |
struct msp *next_msp; | |
} Msp, *Msp_ptr; | |
Msp_ptr msp_list, *msp; | |
main(argc, argv) | |
int argc; | |
char *argv[]; | |
{ | |
int i, v; | |
double x, log(), sqrt(), significance(); | |
char *ckalloc(); | |
if (argc < 3) | |
fatalf("%s seq1 seq2 [M=] [I=] [V=] [K=] [W=] [X=] [noalign] [stats]", | |
argv[0]); | |
M = DEFAULT_M; | |
I = DEFAULT_I; | |
V = DEFAULT_V; | |
W = DEFAULT_W; | |
X = -1; | |
alignments = 1; | |
print_stats = 0; | |
while (--argc > 2) { | |
if (strcmp(argv[argc],"noalign") == 0) { | |
alignments = 0; | |
continue; | |
} | |
if (strcmp(argv[argc],"stats") == 0) { | |
print_stats = 1; | |
continue; | |
} | |
if (argv[argc][1] != '=') | |
fatalf("argument %d has improper form", argc); | |
v = atoi(argv[argc] + 2); | |
switch (argv[argc][0]) { | |
case 'M': | |
case 'm': | |
M = v; | |
break; | |
case 'I': | |
case 'i': | |
I = v; | |
break; | |
case 'V': | |
case 'v': | |
V = v; | |
break; | |
case 'K': | |
case 'k': | |
K = v; | |
break; | |
case 'W': | |
case 'w': | |
W = v; | |
break; | |
case 'X': | |
case 'x': | |
X = v; | |
break; | |
default: | |
fatal("Options are M, I, V, K, W, X."); | |
} | |
} | |
if (X == -1) | |
X = 10*M; | |
if (V > I) | |
fatal("transversions score higher than transitions?"); | |
len1 = get_seq(argv[1], &seq1); | |
if (strcmp(argv[1],argv[2]) == 0) { | |
if (W == 0) | |
fatal("file1 = file2 not implemented when W=0"); | |
seq2 = seq1; | |
len2 = len1; | |
} else | |
len2 = get_seq(argv[2], &seq2); | |
s1 = seq1 - 1; /* so subscripts start with 1 */ | |
s2 = seq2 - 1; | |
diag_lev = (long *)ckalloc((len1+len2+1)*sizeof(long)) + len1; | |
/* set up scoring matrix */ | |
S['A']['A'] = S['C']['C'] = S['G']['G'] = S['T']['T'] = M; | |
S['A']['G'] = S['G']['A'] = S['C']['T'] = S['T']['C'] = I; | |
S['A']['C'] = S['A']['T'] = S['C']['A'] = S['C']['G'] = | |
S['G']['C'] = S['G']['T'] = S['T']['A'] = S['T']['G'] = V; | |
get_params(); | |
if (K == 0) { | |
x = log(DEFAULT_SIG)/(-param_K*len1*len2); | |
K = 2*log(sqrt(x))/(-param_lambda); | |
while (significance(K-1) >= DEFAULT_SIG) | |
--K; | |
while (significance(K) < DEFAULT_SIG) | |
++K; | |
} | |
if (alignments) { | |
printf("#:lav\n\n"); | |
printf("d {\n\"BLAST output with parameters:\n"); | |
printf("M = %d, I = %d, V = %d\n", M, I, V); | |
if (W > 0) | |
printf("W = %d, X = %d\"", W, X); | |
else | |
printf("W = 0 (computes all LMSPs)\""); | |
printf("\n}\n"); | |
printf("s {\n \"%s\" 1 %d\n \"%s\" 1 %d\n}\n", | |
argv[1], len1, argv[2], len2); | |
printf("k {\n \"significance\"\n}\n"); | |
} | |
if (W > 0) { | |
bld_table(); | |
search(); | |
} else | |
get_msps(); | |
sort_msps(); | |
if (!alignments) | |
printf("\n\n Seq. 1 Seq. 2 len score signif W X\n"); | |
/* print the matches for each sequence */ | |
for (i = 0; i < numMSPs; ++i) | |
print_msp(i); | |
if (W == 0 && print_stats) { | |
printf("\n%3d MSPs above significance level 0.99\n", sig99); | |
printf("%3d MSPs above significance level 0.90\n", sig90); | |
printf("%3d MSPs above significance level 0.50\n", sig50); | |
printf("\nPercent of MSPs missed by BLAST at various"); | |
printf(" W and significance levels:\n"); | |
printf(" W: 0.99 0.90 0.50\n"); | |
for (i = 4; i <= 12; ++i) | |
printf("%2d: %4.1f%% %4.1f%% %4.1f%%\n", | |
i, | |
100*missW[i][0]/((float)sig99), | |
100*missW[i][1]/((float)sig90), | |
100*missW[i][2]/((float)sig50)); | |
printf("\nPercent of MSPs missed by BLAST at various"); | |
printf(" X and significance levels:\n"); | |
printf(" X: 0.99 0.90 0.50\n"); | |
for (i = 1; i <= 20; ++i) | |
printf("%2d: %4.1f%% %4.1f%% %4.1f%%\n", | |
i, | |
100*missX[i][0]/((float)sig99), | |
100*missX[i][1]/((float)sig90), | |
100*missX[i][2]/((float)sig50)); | |
} | |
if (W > 0 && print_stats) { | |
printf("\n\nStatistics:\n"); | |
printf(" %s: M = %d, I = %d, V = %d, K = %d, W = %d, X = %d\n", | |
argv[0], M, I, V, K, W, X); | |
printf(" # of word hits = %d\n", numhits); | |
printf(" # of matches found = %d\n", numMSPs); | |
} | |
if (strcmp(argv[1], argv[2]) == 0) { | |
/* self-comparison; insert trivial diagonal */ | |
printf("a {\n s 0.0\n b 1 1\n e %d %d\n", len1, len1); | |
printf(" l 1 1 %d %d 0.0\n}\n", len1, len1); | |
} | |
exit (0); | |
} | |
/* get_seq - read a sequence */ | |
int get_seq(file_name, seqptr) | |
char *file_name, **seqptr; | |
{ | |
FILE *qp, *ckopen(); | |
char *p, *fgets(), *strchr(), *ckalloc(); | |
int c; | |
long n; | |
qp = ckopen(file_name, "r"); | |
for (n = 0; (c = getc(qp)) != EOF; ) | |
if (c != '\n') | |
++n; | |
*seqptr = ckalloc(n+1); | |
rewind(qp); | |
if (n == 0) | |
return 0; | |
for (p = *seqptr; ; ) { | |
if (fgets(p, (int)n+1, qp) == NULL) { | |
fclose(qp); | |
*p = '\0'; | |
break; | |
} | |
if (p == *seqptr && strchr("ACTG", *p) == NULL) | |
continue; | |
while (isupper(*p)) | |
++p; | |
if (*p != '\n' && *p != '\0') | |
fatalf("Illegal character %c in query sequence.", *p); | |
} | |
return p - *seqptr; | |
} | |
/* ------------- build table of W-tuples in the first sequence ------------*/ | |
/* The data structure could be simplified if we limited W to, say, W <= 8. */ | |
struct hash_node { | |
long ecode; /* integer encoding of the word */ | |
int pos; /* positions where word hits query sequence */ | |
struct hash_node *link; /* next word with same last 7.5 letters */ | |
}; | |
#define HASH_SIZE 32767 /* 2**15 - 1 */ | |
struct hash_node *hashtab[HASH_SIZE+1]; | |
int encoding[128]; | |
int mask; | |
long *next_pos; | |
bld_table() | |
{ | |
long ecode; | |
int i; | |
char *q, *ckalloc(); | |
/* perform initializations */ | |
encoding['C'] = 1; | |
encoding['G'] = 2; | |
encoding['T'] = 3; | |
mask = (1 << (W+W-2)) - 1; | |
next_pos = (long *)ckalloc(len1*sizeof(long)); | |
q = seq1 - 1; | |
ecode = 0L; | |
for (i = 1; i < W; ++i) | |
ecode = (ecode << 2) + encoding[*++q]; | |
while (*++q) { | |
ecode = ((ecode & mask) << 2) + encoding[*q]; | |
add_word(ecode, (long)(q - seq1 +1)); | |
} | |
} | |
/* add_word - add a word to the table of critical words */ | |
add_word(ecode, pos) | |
long ecode, pos; | |
{ | |
struct hash_node *h; | |
long hval; | |
char *ckalloc(); | |
hval = ecode & HASH_SIZE; | |
for (h = hashtab[hval]; h; h = h->link) | |
if (h->ecode == ecode) | |
break; | |
if (h == NULL) { | |
h = (struct hash_node *) ckalloc (sizeof(struct hash_node)); | |
h->link = hashtab[hval]; | |
hashtab[hval] = h; | |
h->ecode = ecode; | |
h->pos = -1; | |
} | |
next_pos[pos] = h->pos; | |
h->pos = pos; | |
} | |
/* ----------------------- search the second sequence --------------------*/ | |
search() | |
{ | |
register struct hash_node *h; | |
register char *s; | |
register long ecode, hval; | |
int i; | |
long p; | |
s = seq2 - 1; | |
ecode = 0L; | |
for (i = 1; i < W; ++i) | |
ecode = (ecode << 2) + encoding[*++s]; | |
while (*++s) { | |
ecode = ((ecode & mask) << 2) + encoding[*s]; | |
hval = ecode & HASH_SIZE; | |
for (h = hashtab[hval]; h; h = h->link) | |
if (h->ecode == ecode) { | |
for (p = h->pos; p >= 0; p = next_pos[p]) | |
extend_hit(p, (long)(s+1-seq2)); | |
break; | |
} | |
} | |
} | |
/* extend_hit - extend a word-sized hit to a longer match */ | |
extend_hit(pos1, pos2) | |
long pos1, pos2; | |
{ | |
char *beg2, *beg1, *end1, *q, *s; | |
long min_sum, max_sum, sum, diag, score; | |
Msp_ptr mp; | |
if (seq1 == seq2 && pos1 >= pos2) | |
return; | |
numhits++; | |
diag = pos2 - pos1; | |
if (diag_lev[diag] > pos1) | |
return; | |
/* extend to the right */ | |
max_sum = sum = 0; | |
end1 = q = seq1 + pos1; | |
s = seq2 + pos2; | |
while (*s != '\0' && *q != '\0' && sum >= max_sum - X) { | |
sum += SUBSTITUTION_SCORE(*s++,*q++); | |
if (sum > max_sum) { | |
max_sum = sum; | |
end1 = q; | |
} | |
} | |
/* extend to the left */ | |
min_sum = sum = 0; | |
beg1 = q = (seq1 + pos1) - W; | |
beg2 = s = seq2 + pos2 - W; | |
while (s > seq2 && q > seq1 && sum >= min_sum - X) { | |
sum += SUBSTITUTION_SCORE(*--s,*--q); | |
if (sum > min_sum) { | |
min_sum = sum; | |
beg2 = s; | |
beg1 = q; | |
} | |
} | |
score = M*W + max_sum + min_sum; | |
if (score >= K) { | |
++numMSPs; | |
mp = (Msp_ptr)ckalloc(sizeof(Msp)); | |
mp->len = end1 - beg1; | |
mp->pos1 = beg1 - seq1; | |
mp->pos2 = beg2 - seq2; | |
mp->score = score; | |
mp->next_msp = msp_list; | |
msp_list = mp; | |
} | |
diag_lev[diag] = (end1 - seq1) + W; | |
} | |
/* ---------------- compute locally maximal sequence pairs ---------------*/ | |
get_msps() | |
{ | |
long d, i, score, best_score, end1, beg1, lower, upper; | |
for (d = 1-len1; d < len2; ++d) { | |
beg1 = best_score = score = 0; | |
lower = max(1, 1-d); | |
upper = min(len1, len2-d); | |
for (i = lower; i <= upper; ++i) | |
if (score > 0) { | |
score += SUBSTITUTION_SCORE(s1[i], s2[i+d]); | |
if (score > best_score) { | |
best_score = score; | |
end1 = i; | |
} | |
} else { | |
check_score(best_score, d, beg1, end1); | |
best_score = score = | |
SUBSTITUTION_SCORE(s1[i], s2[i+d]); | |
beg1 = end1 = i; | |
} | |
check_score(best_score, d, beg1, end1); | |
} | |
} | |
check_score(score, d, beg1, end1) | |
long score, d, beg1, end1; | |
{ | |
char *ckalloc(); | |
Msp_ptr mp; | |
if (score >= K) { | |
++numMSPs; | |
mp = (Msp_ptr)ckalloc(sizeof(Msp)); | |
mp->len = end1 - beg1 + 1; | |
mp->pos1 = beg1 - 1; | |
mp->pos2 = beg1 + d - 1; | |
mp->score = score; | |
mp->next_msp = msp_list; | |
msp_list = mp; | |
} | |
} | |
/* sort_msps - order database sequence for printing */ | |
sort_msps() | |
{ | |
char *ckalloc(); | |
int i; | |
Msp_ptr mp; | |
msp = (Msp_ptr *) ckalloc(numMSPs*sizeof(Msp_ptr)); | |
for (i = 0, mp = msp_list; i < numMSPs; ++i, mp = mp->next_msp) | |
msp[i] = mp; | |
for (i = (numMSPs/2) - 1; i >= 0; --i) | |
heapify(i, (int) numMSPs-1); | |
for (i = numMSPs-1; i > 0; --i) { | |
mp = msp[0]; | |
msp[0] = msp[i]; | |
msp[i] = mp; | |
if (i > 1) | |
heapify(0, i-1); | |
} | |
} | |
/* heapify - core procedure for heapsort */ | |
heapify(i, last) | |
int i, last; | |
{ | |
int lim = (last-1)/2, left_son, small_son; | |
Msp_ptr mp; | |
while (i <= lim) { | |
left_son = 2*i + 1; | |
if (left_son == last) | |
small_son = left_son; | |
else | |
small_son = smaller(left_son, left_son+1); | |
if (smaller(i, small_son) == small_son) { | |
mp = msp[i]; | |
msp[i] = msp[small_son]; | |
msp[small_son] = mp; | |
i = small_son; | |
} else | |
break; | |
} | |
} | |
/* smaller - determine which of two sequences should be printed first */ | |
int smaller(i, j) | |
int i, j; | |
{ | |
Msp_ptr ki = msp[i], kj = msp[j]; | |
if (ki->score == kj->score) | |
return ((ki->pos1 >= kj->pos1) ? i : j); | |
/* print one with larger score first */ | |
return ((ki->score <= kj->score) ? i : j); | |
} | |
/* -------------------------- compute significance parameters ---------------*/ | |
get_params() | |
{ | |
double _p[1000], *p , log(), sqrt(), pMatch, pTransit, pTransver, N; | |
long A1, A2, C1, C2, G1, G2, T1, T2; | |
int c, i; | |
A1 = A2 = C1 = C2 = G1 = G2 = T1 = T2 = 0; | |
for (i = 0; i < len1; ++i) | |
if ((c = seq1[i]) == 'A') | |
++A1; | |
else if (c == 'C') | |
++C1; | |
else if (c == 'G') | |
++G1; | |
else if (c == 'T') | |
++T1; | |
for (i = 0; i < len2; ++i) | |
if ((c = seq2[i]) == 'A') | |
++A2; | |
else if (c == 'C') | |
++C2; | |
else if (c == 'G') | |
++G2; | |
else if (c == 'T') | |
++T2; | |
N = ((double)len1)*((double)len2); | |
pMatch = ((double)A1)*((double)A2) | |
+ ((double)C1)*((double)C2) | |
+ ((double)G1)*((double)G2) | |
+ ((double)T1)*((double)T2); | |
pTransit = ((double)A1)*((double)G2) | |
+ ((double)C1)*((double)T2) | |
+ ((double)G1)*((double)A2) | |
+ ((double)T1)*((double)C2); | |
pTransver = N - pMatch - pTransit; | |
for (i = 0; i < V+M; ++i) | |
_p[i] = 0.0; | |
p = _p - V; | |
p[V] = pTransver/N; | |
p[I] += (pTransit/N); | |
p[M] += (pMatch/N); | |
if (karlin(V, M, _p, ¶m_lambda, ¶m_K) == 0) | |
fatal("compution of significance levels failed"); | |
} | |
/**************** Statistical Significance Parameter Subroutine **************** | |
Version 1.0 February 2, 1990 | |
Version 1.1 July 5, 1990 | |
Program by: Stephen Altschul | |
Address: National Center for Biotechnology Information | |
National Library of Medicine | |
National Institutes of Health | |
Bethesda, MD 20894 | |
Internet: [email protected] | |
See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical | |
Significance of Molecular Sequence Features by Using General Scoring | |
Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268. | |
Computes the parameters lambda and K for use in calculating the | |
statistical significance of high-scoring segments or subalignments. | |
The scoring scheme must be integer valued. A positive score must be | |
possible, but the expected (mean) score must be negative. | |
A program that calls this routine must provide the value of the lowest | |
possible score, the value of the greatest possible score, and a pointer | |
to an array of probabilities for the occurence of all scores between | |
these two extreme scores. For example, if score -2 occurs with | |
probability 0.7, score 0 occurs with probability 0.1, and score 3 | |
occurs with probability 0.2, then the subroutine must be called with | |
low = -2, high = 3, and pr pointing to the array of values | |
{ 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide | |
pointers to lambda and K; the subroutine will then calculate the values | |
of these two parameters. In this example, lambda=0.330 and K=0.154. | |
The parameters lambda and K can be used as follows. Suppose we are | |
given a length N random sequence of independent letters. Associated | |
with each letter is a score, and the probabilities of the letters | |
determine the probability for each score. Let S be the aggregate score | |
of the highest scoring contiguous segment of this sequence. Then if N | |
is sufficiently large (greater than 100), the following bound on the | |
probability that S is greater than or equal to x applies: | |
P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ]. | |
In other words, the p-value for this segment can be written as | |
1-exp[-KN*exp(-lambda*S)]. | |
This formula can be applied to pairwise sequence comparison by assigning | |
scores to pairs of letters (e.g. amino acids), and by replacing N in the | |
formula with N*M, where N and M are the lengths of the two sequences | |
being compared. | |
In addition, letting y = KN*exp(-lambda*S), the p-value for finding m | |
distinct segments all with score >= S is given by: | |
2 m-1 -y | |
1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e | |
Notice that for m=1 this formula reduces to 1-exp(-y), which is the same | |
as the previous formula. | |
*******************************************************************************/ | |
#define MAXIT 150 /* Maximum number of iterations used in calculating K */ | |
karlin(low,high,pr,lambda,K) | |
int low; /* Lowest score (must be negative) */ | |
int high; /* Highest score (must be positive) */ | |
double *pr; /* Probabilities for various scores */ | |
double *lambda; /* Pointer to parameter lambda */ | |
double *K; /* Pointer to parmeter K */ | |
{ | |
int i,j,range,lo,hi,first,last; | |
double up,new,Sum,av; | |
register double sum; | |
double *p,*P,*ptrP,exp(); | |
register double *ptr1,*ptr2,*ptr1e; | |
char *calloc(); | |
/* Check that scores and their associated probabilities are valid */ | |
if (low>=0) { | |
fprintf(stderr,"Lowest score must be negative.\n"); | |
return 0; | |
} | |
for (i=range=high-low;i> -low && !pr[i];--i); | |
if (i<= -low) { | |
fprintf(stderr,"A positive score must be possible.\n"); | |
return 0; | |
} | |
for (sum=i=0;i<=range;sum+=pr[i++]) if (pr[i]<0) { | |
fprintf(stderr,"Negative probabilities not allowed.\n"); | |
return 0; | |
} | |
if (sum<0.99995 || sum>1.00005) fprintf(stderr,"Probabilities sum to %.4f. Normalizing.\n",sum); | |
p= (double *) calloc(range+1,sizeof(double)); | |
for (Sum=low,i=0;i<=range;++i) Sum+=i*(p[i]=pr[i]/sum); | |
if (Sum>=0) { | |
fprintf(stderr,"Invalid (non-negative) expected score: %.3f\n",Sum); | |
free(p); | |
return 0; | |
} | |
/* Calculate the parameter lambda */ | |
up=0.5; | |
do { | |
up*=2; | |
ptr1=p; | |
for (sum=0,i=low;i<=high;++i) sum+= *ptr1++ * exp(up*i); | |
} | |
while (sum<1.0); | |
for (*lambda=j=0;j<25;++j) { | |
new=(*lambda+up)/2.0; | |
ptr1=p; | |
for (sum=0,i=low;i<=high;++i) sum+= *ptr1++ * exp(new*i); | |
if (sum>1.0) up=new; | |
else *lambda=new; | |
} | |
/* Calculate the pamameter K */ | |
ptr1=p; | |
for (av=0,i=low;i<=high;++i) av+= *ptr1++ *i*exp(*lambda*i); | |
if (low == -1 || high == 1) { | |
*K = high==1 ? av : Sum*Sum/av; | |
*K *= 1.0 - exp(- *lambda); | |
free(p); | |
return 1; /* Parameters calculated successfully */ | |
} | |
Sum=lo=hi=0; | |
P= (double *) calloc(MAXIT*range+1,sizeof(double)); | |
for (*P=sum=j=1;j<=MAXIT && sum>0.00001;Sum+=sum/=j++) { | |
first=last=range; | |
for (ptrP=P+(hi+=high)-(lo+=low);ptrP>=P;*ptrP-- =sum) { | |
ptr1=ptrP-first; | |
ptr1e=ptrP-last; | |
ptr2=p+first; | |
for (sum=0;ptr1>=ptr1e;) sum+= *ptr1-- * *ptr2++; | |
if (first) --first; | |
if (ptrP-P<=range) --last; | |
} | |
for (sum=0,i=lo;i;++i) sum+= *++ptrP * exp(*lambda*i); | |
for (;i<=hi;++i) sum+= *++ptrP; | |
} | |
if (j>MAXIT) fprintf(stderr,"Value for K may be too large due to insufficient iterations.\n"); | |
for (i=low;!p[i-low];++i); | |
for (j= -i;i<high && j>1;) if (p[++i-low]) j=gcd(j,i); | |
*K = (j*exp(-2*Sum))/(av*(1.0-exp(- *lambda*j))); | |
free(p); | |
free(P); | |
return 1; /* Parameters calculated successfully */ | |
} | |
gcd(a,b) | |
int a,b; | |
{ | |
int c; | |
if (b<0) b= -b; | |
if (b>a) { c=a; a=b; b=c; } | |
for (;b;b=c) { c=a%b; a=b; } | |
return a; | |
} | |
/* ---------------------------------------------------------------------------*/ | |
/* print_msp - display the i-th MSP */ | |
print_msp(i) | |
int i; | |
{ | |
long len, pos1, pos2; | |
Msp_ptr mp = msp[i]; | |
int j, start_j, W, X, sc; | |
double prob, significance(); | |
pos1 = mp->pos1; | |
pos2 = mp->pos2; | |
len = mp->len; | |
prob = significance(mp->score); | |
if (!alignments || print_stats) { | |
if (prob > .99) | |
++sig99; | |
if (prob > .90) | |
++sig90; | |
if (prob > .50) | |
++sig50; | |
for (W = start_j = j = 0; j < len; ++j) | |
if (seq1[pos1+j] != seq2[pos2+j]) { | |
W = max(W, j-start_j); | |
start_j = j+1; | |
} | |
W = max(W, len-start_j); | |
for (j = 4; j <= 12; ++j) { | |
if (prob > .99 && W < j) | |
++missW[j][0]; | |
if (prob > .90 && W < j) | |
++missW[j][1]; | |
if (prob > .50 && W < j) | |
++missW[j][2]; | |
} | |
for (sc = X = j = 0; j < len; ++j) | |
if (sc < 0) { | |
sc += SUBSTITUTION_SCORE(seq1[pos1+j], seq2[pos2+j]); | |
if (sc < X) | |
X = sc; | |
} else | |
sc = SUBSTITUTION_SCORE(seq1[pos1+j], seq2[pos2+j]); | |
X = -X; | |
for (j = 1; j <= 20; ++j) { | |
if (prob > .99 && X > j) | |
++missX[j][0]; | |
if (prob > .90 && X > j) | |
++missX[j][1]; | |
if (prob > .50 && X > j) | |
++missX[j][2]; | |
} | |
} | |
if (!alignments) { | |
printf("%4d: %6d %7d %5d %6d %5.1lf%% %4d %4d\n", | |
i+1, pos1+1, pos2+1, len, mp->score, 100*prob, W, X); | |
return; | |
} | |
printf("a {\n s %d\n b %d %d\n e %d %d\n l %d %d %d %d %1.1lf\n}\n", | |
mp->score, pos1+1, pos2+1, pos1+len, pos2+len, | |
pos1+1, pos2+1, pos1+len, pos2+len, 100*prob); | |
} | |
double significance(score) | |
int score; | |
{ | |
double y, exp(), N; | |
N = ((double)len1)*((double)len2); | |
y = -param_lambda*(score); | |
y = param_K*N*exp(y); | |
return exp(-y); | |
} | |
/* ---------------------------- utility routines -------------------------*/ | |
/* fatal - print message and die */ | |
fatal(msg) | |
char *msg; | |
{ | |
fprintf(stderr, "%s\n", msg); | |
exit(1); | |
} | |
/* fatalf - format message, print it, and die */ | |
fatalf(msg, val) | |
char *msg, *val; | |
{ | |
fprintf(stderr, msg, val); | |
putc('\n', stderr); | |
exit(1); | |
} | |
/* ckopen - open file; check for success */ | |
FILE *ckopen(name, mode) | |
char *name, *mode; | |
{ | |
FILE *fopen(), *fp; | |
if ((fp = fopen(name, mode)) == NULL) | |
fatalf("Cannot open %s.", name); | |
return(fp); | |
} | |
/* ckalloc - allocate space; check for success */ | |
char *ckalloc(amount) | |
int amount; | |
{ | |
char *malloc(), *p; | |
if ((p = malloc( (unsigned) amount)) == NULL) | |
fatal("Ran out of memory."); | |
return(p); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment