📄 blast_stat.c
字号:
for (iterCounter = 0; ((iterCounter < iterlimit) && (innerSum > sumlimit)); outerSum += innerSum /= ++iterCounter) { first = last = range; lowAlignmentScore += low; highAlignmentScore += high; /*dynamic program to compute P(i,j)*/ for (ptrP = alignmentScoreProbabilities + (highAlignmentScore-lowAlignmentScore); ptrP >= alignmentScoreProbabilities; *ptrP-- =innerSum) { ptr1 = ptrP - first; ptr1e = ptrP - last; ptr2 = probArrayStartLow + first; for (innerSum = 0.; ptr1 >= ptr1e; ) innerSum += *ptr1-- * *ptr2++; if (first) --first; if (ptrP - alignmentScoreProbabilities <= range) --last; } /* Horner's rule */ innerSum = *++ptrP; for( i = lowAlignmentScore + 1; i < 0; i++ ) { innerSum = *++ptrP + innerSum * expMinusLambda; } innerSum *= expMinusLambda; for (; i <= highAlignmentScore; ++i) innerSum += *++ptrP; oldsum2 = oldsum; oldsum = innerSum; }#ifdef ADD_GEOMETRIC_TERMS_TO_K /*old code assumed that the later terms in sum were asymptotically comparable to those of a geometric progression, and tried to speed up convergence by guessing the estimated ratio between sucessive terms and using the explicit terms of a geometric progression to speed up convergence. However, the assumption does not always hold, and convergenece of the above code is fast enough in practice*/ /* Terms of geometric progression added for correction */ { double ratio; /* fraction used to generate the geometric progression */ ratio = oldsum / oldsum2; if (ratio >= (1.0 - sumlimit*0.001)) { K = -1.; if (alignmentScoreProbabilities != NULL) sfree(alignmentScoreProbabilities); return K; } sumlimit *= 0.01; while (innerSum > sumlimit) { oldsum *= ratio; outerSum += innerSum = oldsum / ++iterCounter; } }#endif if (expMinusLambda < SMALL_LAMBDA_THRESHOLD ) { K = -exp((double)-2.0*outerSum) / (firstTermClosedForm*(expMinusLambda - 1.0)); } else { K = -exp((double)-2.0*outerSum) / (firstTermClosedForm*BLAST_Expm1(-(double)lambda)); } if (alignmentScoreProbabilities != NULL) sfree(alignmentScoreProbabilities); return K;}/** * Find positive solution to sum_{i=low}^{high} exp(i lambda) = 1. * * @param probs probabilities of a score occurring * @param d the gcd of the possible scores. This equals 1 if the scores * are not a lattice * @param low the lowest possible score * @param high the highest possible score * @param lambda0 an initial value for lambda * @param tolx the tolerance to which lambda must be computed * @param itmax the maximum number of times the function may be * evaluated * @param maxNewton the maximum permissible number of Newton * iteration. After that the computation will proceed by bisection. * @param itn a pointer to an integer that will receive the actually * number of iterations performed. * * Let phi(lambda) = sum_{i=low}^{high} exp(i lambda) - 1. Then phi(lambda) * may be written * * phi(lamdba) = exp(u lambda) p( exp(-lambda) ) * * where p(x) is a polynomial that has exactly two zeros, one at x = 1 * and one at y = exp(-lamdba). It is simpler, more numerically * efficient and stable to apply Newton's method to p(x) than to * phi(lambda). * * We define a safeguarded Newton iteration as follows. Let the * initial interval of uncertainty be [0,1]. If p'(x) >= 0, we bisect * the interval. Otherwise we try a Newton step. If the Newton iterate * lies in the current interval of uncertainty and it reduces the * value of | p(x) | by at least 10%, we accept the new * point. Otherwise, we bisect the current interval of uncertainty. * It is clear that this method converges to a zero of p(x). Since * p'(x) > 0 in an interval containing x = 1, the method cannot * converge to x = 1 and therefore converges to the only other zero, * y. */static double NlmKarlinLambdaNR( double* probs, Int4 d, Int4 low, Int4 high, double lambda0, double tolx, Int4 itmax, Int4 maxNewton, Int4 * itn ) { Int4 k; double x0, x, a = 0, b = 1; double f = 4; /* Larger than any possible value of the poly in [0,1] */ Int4 isNewton = 0; /* we haven't yet taken a Newton step. */ assert( d > 0 ); x0 = exp( -lambda0 ); x = ( 0 < x0 && x0 < 1 ) ? x0 : .5; for( k = 0; k < itmax; k++ ) { /* all iteration indices k */ Int4 i; double g, fold = f; Int4 wasNewton = isNewton; /* If true, then the previous step was a */ /* Newton step */ isNewton = 0; /* Assume that this step is not */ /* Horner's rule for evaluating a polynomial and its derivative */ g = 0; f = probs[low]; for( i = low + d; i < 0; i += d ) { g = x * g + f; f = f * x + probs[i]; } g = x * g + f; f = f * x + probs[0] - 1; for( i = d; i <= high; i += d ) { g = x * g + f; f = f * x + probs[i]; } /* End Horner's rule */ if( f > 0 ) { a = x; /* move the left endpoint */ } else if( f < 0 ) { b = x; /* move the right endpoint */ } else { /* f == 0 */ break; /* x is an exact solution */ } if( b - a < 2 * a * ( 1 - b ) * tolx ) { /* The midpoint of the interval converged */ x = (a + b) / 2; break; } if( k >= maxNewton || /* If convergence of Newton's method appears to be failing; or */ ( wasNewton && fabs( f ) > .9 * fabs(fold) ) || /* if the previous iteration was a Newton step but didn't decrease * f sufficiently; or */ g >= 0 /* if a Newton step will move us away from the desired solution */ ) { /* then */ /* bisect */ x = (a + b)/2; } else { /* try a Newton step */ double p = - f/g; double y = x + p; if( y <= a || y >= b ) { /* The proposed iterate is not in (a,b) */ x = (a + b)/2; } else { /* The proposed iterate is in (a,b). Accept it. */ isNewton = 1; x = y; if( fabs( p ) < tolx * x * (1-x) ) break; /* Converged */ } /* else the proposed iterate is in (a,b) */ } /* else try a Newton step. */ } /* end for all iteration indices k */ *itn = k; return -log(x)/d;}doubleBlast_KarlinLambdaNR(Blast_ScoreFreq* sfp, double initialLambdaGuess){ Int4 low; /* Lowest score (must be negative) */ Int4 high; /* Highest score (must be positive) */ Int4 itn; Int4 i, d; double* sprob; double returnValue; low = sfp->obs_min; high = sfp->obs_max; if (sfp->score_avg >= 0.) { /* Expected score must be negative */ return -1.0; } if (BlastScoreChk(low, high) != 0) return -1.; sprob = sfp->sprob; /* Find greatest common divisor of all scores */ for (i = 1, d = -low; i <= high-low && d > 1; ++i) { if (sprob[i+low] != 0) { d = BLAST_Gcd(d, i); } } returnValue = NlmKarlinLambdaNR( sprob, d, low, high, initialLambdaGuess, BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT, 20, 20 + BLAST_KARLIN_LAMBDA_ITER_DEFAULT, &itn ); return returnValue;}/* BlastKarlinLtoH Calculate H, the relative entropy of the p's and q's*/static double BlastKarlinLtoH(Blast_ScoreFreq* sfp, double lambda){ Int4 score; double H, etonlam, sum, scale; double *probs = sfp->sprob; Int4 low = sfp->obs_min, high = sfp->obs_max; if (lambda < 0.) { return -1.; } if (BlastScoreChk(low, high) != 0) return -1.; etonlam = exp( - lambda ); sum = low * probs[low]; for( score = low + 1; score <= high; score++ ) { sum = score * probs[score] + etonlam * sum; } scale = BLAST_Powi( etonlam, high ); if( scale > 0 ) { H = lambda * sum/scale; } else { /* Underflow of exp( -lambda * high ) */ H = lambda * exp( lambda * high + log(sum) ); } return H;}/**************** Statistical Significance Parameter Subroutine **************** Version 1.0 February 2, 1990 Version 1.2 July 6, 1990 Program by: Stephen Altschul Address: National Center for Biotechnology Information National Library of Medicine National Institutes of Health Bethesda, MD 20894 Internet: altschul@ncbi.nlm.nih.govSee: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical Significance of Molecular Sequence Features by Using General Scoring Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268. Computes the parameters lambda and K for use in calculating the statistical significance of high-scoring segments or subalignments. The scoring scheme must be integer valued. A positive score must be possible, but the expected (mean) score must be negative. A program that calls this routine must provide the value of the lowest possible score, the value of the greatest possible score, and a pointer to an array of probabilities for the occurrence of all scores between these two extreme scores. For example, if score -2 occurs with probability 0.7, score 0 occurs with probability 0.1, and score 3 occurs with probability 0.2, then the subroutine must be called with low = -2, high = 3, and pr pointing to the array of values { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide pointers to lambda and K; the subroutine will then calculate the values of these two parameters. In this example, lambda=0.330 and K=0.154. The parameters lambda and K can be used as follows. Suppose we are given a length N random sequence of independent letters. Associated with each letter is a score, and the probabilities of the letters determine the probability for each score. Let S be the aggregate score of the highest scoring contiguous segment of this sequence. Then if N is sufficiently large (greater than 100), the following bound on the probability that S is greater than or equal to x applies: P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ]. In other words, the p-value for this segment can be written as 1-exp[-KN*exp(-lambda*S)]. This formula can be applied to pairwise sequence comparison by assigning scores to pairs of letters (e.g. amino acids), and by replacing N in the formula with N*M, where N and M are the lengths of the two sequences being compared. In addition, letting y = KN*exp(-lambda*S), the p-value for finding m distinct segments all with score >= S is given by: 2 m-1 -y 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e Notice that for m=1 this formula reduces to 1-exp(-y), which is the same as the previous formula.*******************************************************************************/Int2Blast_KarlinBlkCalc(Blast_KarlinBlk* kbp, Blast_ScoreFreq* sfp){ if (kbp == NULL || sfp == NULL) return 1; /* Calculate the parameter Lambda */ kbp->Lambda = Blast_KarlinLambdaNR(sfp, BLAST_KARLIN_LAMBDA0_DEFAULT); if (kbp->Lambda < 0.) goto ErrExit; /* Calculate H */ kbp->H = BlastKarlinLtoH(sfp, kbp->Lambda); if (kbp->H < 0.) goto ErrExit; /* Calculate K and log(K) */ kbp->K = BlastKarlinLHtoK(sfp, kbp->Lambda, kbp->H); if (kbp->K < 0.) goto ErrExit; kbp->logK = log(kbp->K); /* Normal return */ return 0;ErrExit: kbp->Lambda = kbp->H = kbp->K = -1.;#ifdef BLASTKAR_HUGE_VAL kbp->logK = BLASTKAR_HUGE_VAL;#else kbp->logK = 1.e30;#endif return 1;}/* Calculate the Karlin parameters. This function should be called once for each context, or frame translated. The rfp and stdrfp are calculated for each context, this should be fixed. */Int2BLAST_ScoreBlkFill(BlastScoreBlk* sbp, char* query, Int4 query_length, Int4 context_number){ Blast_ResFreq* rfp,* stdrfp; Int2 retval=0; rfp = Blast_ResFreqNew(sbp); stdrfp = Blast_ResFreqNew(sbp); Blast_ResFreqStdComp(sbp, stdrfp); Blast_ResFreqString(sbp, rfp, query, query_length); sbp->sfp[context_number] = Blast_ScoreFreqNew(sbp->loscore, sbp->hiscore); BlastScoreFreqCalc(sbp, sbp->sfp[context_number], rfp, stdrfp); sbp->kbp_std[context_number] = Blast_KarlinBlkCreate(); retval = Blast_KarlinBlkCalc(sbp->kbp_std[context_number], sbp->sfp[context_number]); if (retval) { rfp = Blast_ResFreqDestruct(rfp); stdrfp = Blast_ResFreqDestruct(stdrfp); return retval; } sbp->kbp_psi[context_number] = Blast_KarlinBlkCreate(); retval = Blast_KarlinBlkCalc(sbp->kbp_psi[context_number], sbp->sfp[context_number]); rfp = Blast_ResFreqDestruct(rfp); stdrfp = Blast_ResFreqDestruct(stdrfp); return retval;}/* Calculates the standard Karlin parameters. This is used if the query is translated and the calculated (real) Karlin parameters are bad, as they're calculated for non-coding regions.*/Blast_KarlinBlk*Blast_KarlinBlkIdealCalc(BlastScoreBlk* sbp){ Blast_KarlinBlk* kbp_ideal; Blast_ResFreq* stdrfp; Blast_ScoreFreq* sfp; stdrfp = Blast_ResFreqNew(sbp); Blast_ResFreqStdComp(sbp, stdrfp); sfp = Blast_ScoreFreqNew(sbp->loscore, sbp->hiscore); BlastScoreFreqCalc(sbp, sfp, stdrfp, stdrfp); kbp_ideal = Blast_KarlinBlkCreate(); Blast_KarlinBlkCalc(kbp_ideal, sfp); stdrfp = Blast_ResFreqDestruct(stdrfp); sfp = Blast_ScoreFreqDestruct(sfp); return kbp_ideal;}Int2Blast_KarlinBlkStandardCalc(BlastScoreBlk* sbp, Int4 context_start, Int4 context_end){ Blast_KarlinBlk* kbp_ideal,* kbp; Int4 in
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -