core_algorithms.c
来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,965 行 · 第 1/5 页
C
1,965 行
sc = imx[i+1][k] - hmm->isc[dsq[i+1]][k]; if (sc <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; } else if (sc == mmx[i][k] + hmm->tsc[TMI][k]) { tr->statetype[tpos] = STM; tr->nodeidx[tpos] = k--; tr->pos[tpos] = i--; } else if (sc == imx[i][k] + hmm->tsc[TII][k]) { tr->statetype[tpos] = STI; tr->nodeidx[tpos] = k; tr->pos[tpos] = i--; } else Die("traceback failed"); break; case STN: /* N connects from S, N */ if (i == 0 && xmx[i][XMN] == 0) { tr->statetype[tpos] = STS; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; } else if (i > 0 && xmx[i+1][XMN] == xmx[i][XMN] + hmm->xsc[XTN][LOOP]) { tr->statetype[tpos] = STN; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; /* note convention adherence: */ tr->pos[tpos-1] = i--; /* first N doesn't emit */ } else Die("traceback failed"); break; case STB: /* B connects from N, J */ if (xmx[i][XMB] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; } else if (xmx[i][XMB] == xmx[i][XMN] + hmm->xsc[XTN][MOVE]) { tr->statetype[tpos] = STN; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; } else if (xmx[i][XMB] == xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) { tr->statetype[tpos] = STJ; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; } else Die("traceback failed"); break; case STE: /* E connects from any M state. k set here */ if (xmx[i][XME] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; } for (k = hmm->M; k >= 1; k--) if (xmx[i][XME] == mmx[i][k] + hmm->esc[k]) { /* check for wing unfolding */ if (Prob2Score(hmm->end[k], 1.) + 1*INTSCALE <= hmm->esc[k]) { int dk; /* need a tmp k while moving thru delete wing */ for (dk = hmm->M; dk > k; dk--) { tr->statetype[tpos] = STD; tr->nodeidx[tpos] = dk; tr->pos[tpos] = 0; tpos++; if (tpos == curralloc) { /* grow trace if necessary */ curralloc += N; P7ReallocTrace(tr, curralloc); } } } tr->statetype[tpos] = STM; tr->nodeidx[tpos] = k--; tr->pos[tpos] = i--; break; } if (k < 0) Die("traceback failed"); break; case STC: /* C comes from C, E */ if (xmx[i][XMC] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; } else if (xmx[i][XMC] == xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) { tr->statetype[tpos] = STC; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; /* note convention adherence: */ tr->pos[tpos-1] = i--; /* first C doesn't emit */ } else if (xmx[i][XMC] == xmx[i][XME] + hmm->xsc[XTE][MOVE]) { tr->statetype[tpos] = STE; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; /* E is a nonemitter */ } else Die("Traceback failed."); break; case STJ: /* J connects from E, J */ if (xmx[i][XMJ] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; } else if (xmx[i][XMJ] == xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) { tr->statetype[tpos] = STJ; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; /* note convention adherence: */ tr->pos[tpos-1] = i--; /* first J doesn't emit */ } else if (xmx[i][XMJ] == xmx[i][XME] + hmm->xsc[XTE][LOOP]) { tr->statetype[tpos] = STE; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; /* E is a nonemitter */ } else Die("Traceback failed."); break; default: Die("traceback failed"); } /* end switch over statetype[tpos-1] */ tpos++; if (tpos == curralloc) { /* grow trace if necessary */ curralloc += N; P7ReallocTrace(tr, curralloc); } } /* end traceback, at S state; tpos == tlen now */ tr->tlen = tpos; P7ReverseTrace(tr); *ret_tr = tr;}/* Function: P7SmallViterbi() * Date: SRE, Fri Mar 6 15:29:41 1998 [St. Louis] * * Purpose: Wrapper function, for linear memory alignment * with same arguments as P7Viterbi(). * * Calls P7ParsingViterbi to break the sequence * into fragments. Then, based on size of fragments, * calls either P7Viterbi() or P7WeeViterbi() to * get traces for them. Finally, assembles all these * traces together to produce an overall optimal * trace for the sequence. * * If the trace isn't needed for some reason, * all we do is call P7ParsingViterbi. * * Args: dsq - sequence in digitized form * L - length of dsq * hmm - the model * mx - DP matrix, growable * ret_tr - RETURN: traceback; pass NULL if it's not wanted * * Returns: Score of optimal alignment in bits. */floatP7SmallViterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr){ struct p7trace_s *ctr; /* collapsed trace of optimal parse */ struct p7trace_s *tr; /* full trace of optimal alignment */ struct p7trace_s **tarr; /* trace array */ int ndom; /* number of subsequences */ int i; /* counter over domains */ int pos; /* position in sequence */ int tpos; /* position in trace */ int tlen; /* length of full trace */ int sqlen; /* length of a subsequence */ int totlen; /* length of L matched by model (as opposed to N/C/J) */ float sc; /* score of optimal alignment */ int t2; /* position in a subtrace */ /* Step 1. Call P7ParsingViterbi to calculate an optimal parse * of the sequence into single-hit subsequences; this parse * is returned in a "collapsed" trace */ sc = P7ParsingViterbi(dsq, L, hmm, &ctr); /* If we don't want full trace, we're done; * also, if parsing viterbi returned a NULL trace we're done. */ if (ctr == NULL || ret_tr == NULL) { P7FreeTrace(ctr); return sc; } /* Step 2. Call either P7Viterbi or P7WeeViterbi on each subsequence * to recover a full traceback of each, collecting them in * an array. */ ndom = ctr->tlen/2 - 1; tarr = MallocOrDie(sizeof(struct p7trace_s *) * ndom); tlen = totlen = 0; for (i = 0; i < ndom; i++) { sqlen = ctr->pos[i*2+2] - ctr->pos[i*2+1]; /* length of subseq */ if (P7ViterbiSpaceOK(sqlen, hmm->M, mx)) { SQD_DPRINTF1((" -- using P7Viterbi on an %dx%d subproblem\n", hmm->M, sqlen)); P7Viterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, mx, &(tarr[i])); } else { SQD_DPRINTF1((" -- using P7WeeViterbi on an %dx%d subproblem\n", hmm->M, sqlen)); P7WeeViterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i])); } tlen += tarr[i]->tlen - 4; /* not counting S->N,...,C->T */ totlen += sqlen; } /* Step 3. Compose the subtraces into one big final trace. * This is wasteful because we're going to TraceDecompose() * it again in both hmmsearch and hmmpfam to look at * individual domains; but we do it anyway so the P7SmallViterbi * interface looks exactly like the P7Viterbi interface. Maybe * long traces shouldn't include all the N/J/C states anyway, * since they're unambiguously implied. */ /* Calculate total trace len and alloc; * nonemitting SNCT + nonemitting J's + emitting NJC */ tlen += 4 + (ndom-1) + (L-totlen); P7AllocTrace(tlen, &tr); tr->tlen = tlen; /* Add N-terminal trace framework */ tr->statetype[0] = STS; tr->nodeidx[0] = 0; tr->pos[0] = 0; tr->statetype[1] = STN; tr->nodeidx[1] = 0; tr->pos[1] = 0; tpos = 2; /* add implied N's */ for (pos = 1; pos <= ctr->pos[1]; pos++) { tr->statetype[tpos] = STN; tr->nodeidx[tpos] = 0; tr->pos[tpos] = pos; tpos++; } /* Add each subseq trace in, with its appropriate * sequence offset derived from the collapsed trace */ for (i = 0; i < ndom; i++) { /* skip SN, CT framework at ends */ for (t2 = 2; t2 < tarr[i]->tlen-2; t2++) { tr->statetype[tpos] = tarr[i]->statetype[t2]; tr->nodeidx[tpos] = tarr[i]->nodeidx[t2]; if (tarr[i]->pos[t2] > 0) tr->pos[tpos] = tarr[i]->pos[t2] + ctr->pos[i*2+1]; else tr->pos[tpos] = 0; tpos++; } /* add nonemitting J or C */ tr->statetype[tpos] = (i == ndom-1) ? STC : STJ; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; tpos++; /* add implied emitting J's */ if (i != ndom-1) for (pos = ctr->pos[i*2+2]+1; pos <= ctr->pos[(i+1)*2+1]; pos++) { tr->statetype[tpos] = STJ; tr->nodeidx[tpos] = 0; tr->pos[tpos] = pos; tpos++; } } /* add implied C's */ for (pos = ctr->pos[ndom*2]+1; pos <= L; pos++) { tr->statetype[tpos] = STC; tr->nodeidx[tpos] = 0; tr->pos[tpos] = pos; tpos++; } /* add terminal T */ tr->statetype[tpos] = STT; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; tpos++; for (i = 0; i < ndom; i++) P7FreeTrace(tarr[i]); free(tarr); P7FreeTrace(ctr); *ret_tr = tr; return sc;}/* Function: P7ParsingViterbi() * Date: SRE, Wed Mar 4 14:07:31 1998 [St. Louis] * * Purpose: The "hmmfs" linear-memory algorithm for finding * the optimal alignment of a very long sequence to * a looping, multihit (e.g. Plan7) model, parsing it into * a series of nonoverlapping subsequences that match * the model once. Other algorithms (e.g. P7Viterbi() * or P7WeeViterbi()) are applied subsequently to * these subsequences to recover complete alignments. * * The hmmfs algorithm appears briefly in [Durbin98], * but is otherwise unpublished. * * The traceback structure returned is special: a * "collapsed" trace S->B->E->...->B->E->T, where * stateidx is unused and pos is used to indicate the * position of B and E in the sequence. The matched * subsequence is B_pos+1...E_pos. The number of * matches in the trace is (tlen/2)-1. * * Args: dsq - sequence in digitized form * L - length of dsq * hmm - the model (log odds scores ready) * ret_tr - RETURN: a collapsed traceback. * * Returns: Score of the optimal Viterbi alignment, in bits. */floatP7ParsingViterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr){ struct dpmatrix_s *mx; /* two rows of score matrix */ struct dpmatrix_s *tmx; /* two rows of misused score matrix: traceback ptrs */ struct p7trace_s *tr; /* RETURN: collapsed traceback */ int **xmx, **mmx, **dmx, **imx; /* convenience ptrs to score matrix */ int **xtr, **mtr, **dtr, **itr; /* convenience ptrs to traceback pointers */ int *btr, *etr; /* O(L) trace ptrs for B, E state pts in seq */ int sc; /* integer score of optimal alignment */ int i,k,tpos; /* index for seq, model, trace position */ int cur, prv; /* indices for rolling dp matrix */ int curralloc; /* size of allocation for tr */ /* Alloc a DP matrix and traceback pointers, two rows each, O(M). * Alloc two O(L) arrays to trace back through the sequence thru B and E. */ mx = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx); tmx = AllocPlan7Matrix(2, hmm->M, &xtr, &mtr, &itr, &dtr); btr = MallocOrDie(sizeof(int) * (L+1)); etr = MallocOrDie(sizeof(int) * (L+1)); /* Initialization of the zero row. */ xmx[0][XMN] = 0; /* S->N, p=1 */ xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */ btr[0] = 0; xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */ etr[0] = -1; for (k = 0; k <= hmm->M; k++) mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */ /* Recursion. Done as a pull. Rolling index trick. Trace ptr propagation trick. * Note some slightly wasteful boundary conditions: * tsc[0] = -INFTY for all eight transitions (no node 0) * D_M and I_M are wastefully calculated (they don't exist) * * Notes on traceback pointer propagation. * - In the path B->E, we propagate the i that B was aligned to in the optimal * alignment, via mtr, dtr, and itr. * - When we reach an E, we record the i of the B it started from in etr. * - In a looping path E->J...->B or terminal path E->C...->T, we propagate * the i that E was aligned to in the optimal alignment via xtr[][XMC] * and xtr[][XMJ]. * - When we enter B, we record the i of the best previous E, or 0 if there * isn't one, in btr. */ for (i = 1; i <= L; i++) { cur = i % 2; prv = !cur; mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?