📄 blast_stat.c
字号:
residues[index] = AMINOACID_TO_NCBISTDAA[toupper(STD_AMINO_ACID_FREQS[index].ch)]; } else { residues[index] = STD_AMINO_ACID_FREQS[index].ch; } } return index;}Int2Blast_ResFreqStdComp(const BlastScoreBlk* sbp, Blast_ResFreq* rfp){ Int2 retval; Uint4 index; Uint1* residues; if (sbp->protein_alphabet == TRUE) { residues = (Uint1*) calloc(DIM(STD_AMINO_ACID_FREQS), sizeof(Uint1)); retval = Blast_GetStdAlphabet(sbp->alphabet_code, residues, DIM(STD_AMINO_ACID_FREQS)); if (retval < 1) return retval; for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++) { rfp->prob[residues[index]] = STD_AMINO_ACID_FREQS[index].p; } sfree(residues); } else { /* beginning of blastna and ncbi2na are the same. */ /* Only blastna used for nucleotides. */ for (index=0; index<DIM(nt_prob); index++) { rfp->prob[index] = nt_prob[index].p; } } Blast_ResFreqNormalize(sbp, rfp, 1.0); return 0;}typedef struct Blast_ResComp { Uint1 alphabet_code; Int4* comp; /* composition of alphabet, array starts at beginning of alphabet. */ Int4* comp0; /* Same array as above, starts at zero. */} Blast_ResComp;static Blast_ResComp*BlastResCompDestruct(Blast_ResComp* rcp){ if (rcp == NULL) return NULL; if (rcp->comp0 != NULL) sfree(rcp->comp0); sfree(rcp); return NULL;}/* Allocated the Blast_ResComp* for a given alphabet. Only the alphabets ncbistdaa and ncbi4na should be used by BLAST.*/static Blast_ResComp*BlastResCompNew(BlastScoreBlk* sbp){ Blast_ResComp* rcp; rcp = (Blast_ResComp*) calloc(1, sizeof(Blast_ResComp)); if (rcp == NULL) return NULL; rcp->alphabet_code = sbp->alphabet_code;/* comp0 has zero offset, comp starts at 0, only one array is allocated. */ rcp->comp0 = (Int4*) calloc(BLAST_MATRIX_SIZE, sizeof(Int4)); if (rcp->comp0 == NULL) { rcp = BlastResCompDestruct(rcp); return rcp; } rcp->comp = rcp->comp0 - sbp->alphabet_start; return rcp;}/* Store the composition of a (query) string. */static Int2BlastResCompStr(BlastScoreBlk* sbp, Blast_ResComp* rcp, char* str, Int4 length){ char* lp,* lpmax; Int2 index; Uint1 mask; if (sbp == NULL || rcp == NULL || str == NULL) return 1; if (rcp->alphabet_code != sbp->alphabet_code) return 1; /* For megablast, check only the first 4 bits of the sequence values */ mask = (sbp->protein_alphabet ? 0xff : 0x0f);/* comp0 starts at zero and extends for "num", comp is the same array, but "start_at" offset. */ for (index=0; index<(sbp->alphabet_size); index++) rcp->comp0[index] = 0; for (lp = str, lpmax = lp+length; lp < lpmax; lp++) { ++rcp->comp[(int)(*lp & mask)]; } /* Don't count ambig. residues. */ for (index=0; index<sbp->ambig_occupy; index++) { rcp->comp[sbp->ambiguous_res[index]] = 0; } return 0;}static Int2Blast_ResFreqClr(const BlastScoreBlk* sbp, Blast_ResFreq* rfp){ Int2 alphabet_max, index; alphabet_max = sbp->alphabet_start + sbp->alphabet_size; for (index=sbp->alphabet_start; index<alphabet_max; index++) rfp->prob[index] = 0.0; return 0;}/* Calculate the residue frequencies associated with the provided ResComp*/static Int2Blast_ResFreqResComp(BlastScoreBlk* sbp, Blast_ResFreq* rfp, Blast_ResComp* rcp){ Int2 alphabet_max, index; double sum = 0.; if (rfp == NULL || rcp == NULL) return 1; if (rfp->alphabet_code != rcp->alphabet_code) return 1; alphabet_max = sbp->alphabet_start + sbp->alphabet_size; for (index=sbp->alphabet_start; index<alphabet_max; index++) sum += rcp->comp[index]; if (sum == 0.) { Blast_ResFreqClr(sbp, rfp); return 0; } for (index=sbp->alphabet_start; index<alphabet_max; index++) rfp->prob[index] = rcp->comp[index] / sum; return 0;}static Int2Blast_ResFreqString(BlastScoreBlk* sbp, Blast_ResFreq* rfp, char* string, Int4 length){ Blast_ResComp* rcp; rcp = BlastResCompNew(sbp); BlastResCompStr(sbp, rcp, string, length); Blast_ResFreqResComp(sbp, rfp, rcp); rcp = BlastResCompDestruct(rcp); return 0;}static Int2BlastScoreChk(Int4 lo, Int4 hi){ if (lo >= 0 || hi <= 0 || lo < BLAST_SCORE_1MIN || hi > BLAST_SCORE_1MAX) return 1; if (hi - lo > BLAST_SCORE_RANGE_MAX) return 1; return 0;}Blast_ScoreFreq*Blast_ScoreFreqNew(Int4 score_min, Int4 score_max){ Blast_ScoreFreq* sfp; Int4 range; if (BlastScoreChk(score_min, score_max) != 0) return NULL; sfp = (Blast_ScoreFreq*) calloc(1, sizeof(Blast_ScoreFreq)); if (sfp == NULL) return NULL; range = score_max - score_min + 1; sfp->sprob = (double*) calloc(range, sizeof(double)); if (sfp->sprob == NULL) { Blast_ScoreFreqDestruct(sfp); return NULL; } sfp->sprob0 = sfp->sprob; sfp->sprob -= score_min; /* center around 0 */ sfp->score_min = score_min; sfp->score_max = score_max; sfp->obs_min = sfp->obs_max = 0; sfp->score_avg = 0.0; return sfp;}static Int2BlastScoreFreqCalc(BlastScoreBlk* sbp, Blast_ScoreFreq* sfp, Blast_ResFreq* rfp1, Blast_ResFreq* rfp2){ Int4 ** matrix; Int4 score, obs_min, obs_max; double score_sum, score_avg; Int2 alphabet_start, alphabet_end, index1, index2; if (sbp == NULL || sfp == NULL) return 1; if (sbp->loscore < sfp->score_min || sbp->hiscore > sfp->score_max) return 1; for (score = sfp->score_min; score <= sfp->score_max; score++) sfp->sprob[score] = 0.0; matrix = sbp->matrix; alphabet_start = sbp->alphabet_start; alphabet_end = alphabet_start + sbp->alphabet_size; for (index1=alphabet_start; index1<alphabet_end; index1++) { for (index2=alphabet_start; index2<alphabet_end; index2++) { score = matrix[index1][index2]; if (score >= sbp->loscore) { sfp->sprob[score] += rfp1->prob[index1] * rfp2->prob[index2]; } } } score_sum = 0.; obs_min = obs_max = BLAST_SCORE_MIN; for (score = sfp->score_min; score <= sfp->score_max; score++) { if (sfp->sprob[score] > 0.) { score_sum += sfp->sprob[score]; obs_max = score; if (obs_min == BLAST_SCORE_MIN) obs_min = score; } } sfp->obs_min = obs_min; sfp->obs_max = obs_max; score_avg = 0.0; if (score_sum > 0.0001 || score_sum < -0.0001) { for (score = obs_min; score <= obs_max; score++) { sfp->sprob[score] /= score_sum; score_avg += score * sfp->sprob[score]; } } sfp->score_avg = score_avg; return 0;}#define DIMOFP0 (iterlimit*range + 1)#define DIMOFP0_MAX (BLAST_KARLIN_K_ITER_MAX*BLAST_SCORE_RANGE_MAX+1)#define SMALL_LAMBDA_THRESHOLD 20 /*defines special case in K computation*/ /*threshold is on exp(-Lambda)*//*The following procedure computes K. The input includes Lambda, H, * and an array of probabilities for each score. * There are distinct closed form for three cases: * 1. high score is 1 low score is -1 * 2. high score is 1 low score is not -1 * 3. low score is -1, high score is not 1 * * Otherwise, in most cases the value is computed as: * -exp(-2.0*outerSum) / ((H/lambda)*(exp(-lambda) - 1) * The last term (exp(-lambda) - 1) can be computed in two different * ways depending on whether lambda is small or not. * outerSum is a sum of the terms * innerSum/j, where j is denoted by iterCounter in the code. * The sum is truncated when the new term innersum/j i sufficiently small. * innerSum is a weighted sum of the probabilities of * of achieving a total score i in a gapless alignment, * which we denote by P(i,j). * of exactly j characters. innerSum(j) has two parts * Sum over i < 0 P(i,j)exp(-i * lambda) + * Sum over i >=0 P(i,j) * The terms P(i,j) are computed by dynamic programming. * An earlier version was flawed in that ignored the special case 1 * and tried to replace the tail of the computation of outerSum * by a geometric series, but the base of the geometric series * was not accurately estimated in some cases. */static doubleBlastKarlinLHtoK(Blast_ScoreFreq* sfp, double lambda, double H){ /*The next array stores the probabilities of getting each possible score in an alignment of fixed length; the array is shifted during part of the computation, so that entry 0 is for score 0. */ double *alignmentScoreProbabilities = NULL; Int4 low; /* Lowest score (must be negative) */ Int4 high; /* Highest score (must be positive) */ Int4 range; /* range of scores, computed as high - low*/ double K; /* local copy of K to return*/ int i; /*loop index*/ int iterCounter; /*counter on iterations*/ Int4 divisor; /*candidate divisor of all scores with non-zero probabilities*/ /*highest and lowest possible alignment scores for current length*/ Int4 lowAlignmentScore, highAlignmentScore; Int4 first, last; /*loop indices for dynamic program*/ register double innerSum; double oldsum, oldsum2; /* values of innerSum on previous iterations*/ double outerSum; /* holds sum over j of (innerSum for iteration j/j)*/ double score_avg; /*average score*/ /*first term to use in the closed form for the case where high == 1 or low == -1, but not both*/ double firstTermClosedForm; /*usually store H/lambda*/ int iterlimit; /*upper limit on iterations*/ double sumlimit; /*lower limit on contributions to sum over scores*/ /*array of score probabilities reindexed so that low is at index 0*/ double *probArrayStartLow; /*pointers used in dynamic program*/ double *ptrP, *ptr1, *ptr2, *ptr1e; double expMinusLambda; /*e^^(-Lambda) */ if (lambda <= 0. || H <= 0.) { /* Theory dictates that H and lambda must be positive, so * return -1 to indicate an error */ return -1.; } /*Karlin-Altschul theory works only if the expected score is negative*/ if (sfp->score_avg >= 0.0) { return -1.; } low = sfp->obs_min; high = sfp->obs_max; range = high - low; probArrayStartLow = &sfp->sprob[low]; /* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of Karlin&Altschul (1990) */ for (i = 1, divisor = -low; i <= range && divisor > 1; ++i) { if (probArrayStartLow[i]) divisor = BLAST_Gcd(divisor, i); } high /= divisor; low /= divisor; lambda *= divisor; range = high - low; firstTermClosedForm = H/lambda; expMinusLambda = exp((double) -lambda); if (low == -1 && high == 1) { K = (sfp->sprob[low] - sfp->sprob[high]) * (sfp->sprob[low] - sfp->sprob[high]) / sfp->sprob[low]; return(K); } if (low == -1 || high == 1) { if (high == 1) ; else { score_avg = sfp->score_avg / divisor; firstTermClosedForm = (score_avg * score_avg) / firstTermClosedForm; } return firstTermClosedForm * (1.0 - expMinusLambda); } sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT; iterlimit = BLAST_KARLIN_K_ITER_MAX; if (DIMOFP0 > DIMOFP0_MAX) { return -1.; } alignmentScoreProbabilities = (double *)calloc(DIMOFP0, sizeof(*alignmentScoreProbabilities)); if (alignmentScoreProbabilities == NULL) return -1.; outerSum = 0.; lowAlignmentScore = highAlignmentScore = 0; alignmentScoreProbabilities[0] = innerSum = oldsum = oldsum2 = 1.;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -