📄 core_algorithms.c
字号:
{ sqlen = ctr->pos[i*2+2] - ctr->pos[i*2+1]; /* length of subseq */ if (P7ViterbiSize(sqlen, hmm->M) > RAMLIMIT) P7WeeViterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i])); else P7Viterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i])); tlen += tarr[i]->tlen - 4; /* not counting S->N,...,C->T */ totlen += sqlen; } /* Step 3. Compose the subtraces into one big final trace. * This is wasteful because we're going to TraceDecompose() * it again in both hmmsearch and hmmpfam to look at * individual domains; but we do it anyway so the P7SmallViterbi * interface looks exactly like the P7Viterbi interface. Maybe * long traces shouldn't include all the N/J/C states anyway, * since they're unambiguously implied. */ /* Calculate total trace len and alloc; * nonemitting SNCT + nonemitting J's + emitting NJC */ tlen += 4 + (ndom-1) + (L-totlen); P7AllocTrace(tlen, &tr); tr->tlen = tlen; /* Add N-terminal trace framework */ tr->statetype[0] = STS; tr->nodeidx[0] = 0; tr->pos[0] = 0; tr->statetype[1] = STN; tr->nodeidx[1] = 0; tr->pos[1] = 0; tpos = 2; /* add implied N's */ for (pos = 1; pos <= ctr->pos[1]; pos++) { tr->statetype[tpos] = STN; tr->nodeidx[tpos] = 0; tr->pos[tpos] = pos; tpos++; } /* Add each subseq trace in, with its appropriate * sequence offset derived from the collapsed trace */ for (i = 0; i < ndom; i++) { /* skip SN, CT framework at ends */ for (t2 = 2; t2 < tarr[i]->tlen-2; t2++) { tr->statetype[tpos] = tarr[i]->statetype[t2]; tr->nodeidx[tpos] = tarr[i]->nodeidx[t2]; if (tarr[i]->pos[t2] > 0) tr->pos[tpos] = tarr[i]->pos[t2] + ctr->pos[i*2+1]; else tr->pos[tpos] = 0; tpos++; } /* add nonemitting J or C */ tr->statetype[tpos] = (i == ndom-1) ? STC : STJ; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; tpos++; /* add implied emitting J's */ if (i != ndom-1) for (pos = ctr->pos[i*2+2]+1; pos <= ctr->pos[(i+1)*2+1]; pos++) { tr->statetype[tpos] = STJ; tr->nodeidx[tpos] = 0; tr->pos[tpos] = pos; tpos++; } } /* add implied C's */ for (pos = ctr->pos[ndom*2]+1; pos <= L; pos++) { tr->statetype[tpos] = STC; tr->nodeidx[tpos] = 0; tr->pos[tpos] = pos; tpos++; } /* add terminal T */ tr->statetype[tpos] = STT; tr->nodeidx[tpos] = 0; tr->pos[tpos] = 0; tpos++; for (i = 0; i < ndom; i++) P7FreeTrace(tarr[i]); free(tarr); P7FreeTrace(ctr); *ret_tr = tr; return sc;}/* Function: P7ParsingViterbi() * Date: SRE, Wed Mar 4 14:07:31 1998 [St. Louis] * * Purpose: The "hmmfs" linear-memory algorithm for finding * the optimal alignment of a very long sequence to * a looping, multihit (e.g. Plan7) model, parsing it into * a series of nonoverlapping subsequences that match * the model once. Other algorithms (e.g. P7Viterbi() * or P7WeeViterbi()) are applied subsequently to * these subsequences to recover complete alignments. * * The hmmfs algorithm appears briefly in [Durbin98], * but is otherwise unpublished. * * The traceback structure returned is special: a * "collapsed" trace S->B->E->...->B->E->T, where * stateidx is unused and pos is used to indicate the * position of B and E in the sequence. The matched * subsequence is B_pos+1...E_pos. The number of * matches in the trace is (tlen/2)-1. * * Args: dsq - sequence in digitized form * L - length of dsq * hmm - the model (log odds scores ready) * ret_tr - RETURN: a collapsed traceback. * * Returns: Score of the optimal Viterbi alignment, in bits. */floatP7ParsingViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr){ struct dpmatrix_s *mx; /* two rows of score matrix */ struct dpmatrix_s *tmx; /* two rows of misused score matrix: traceback ptrs */ struct p7trace_s *tr; /* RETURN: collapsed traceback */ int **xmx, **mmx, **dmx, **imx; /* convenience ptrs to score matrix */ int **xtr, **mtr, **dtr, **itr; /* convenience ptrs to traceback pointers */ int *btr, *etr; /* O(L) trace ptrs for B, E state pts in seq */ int sc; /* integer score of optimal alignment */ int i,k,tpos; /* index for seq, model, trace position */ int cur, prv; /* indices for rolling dp matrix */ int curralloc; /* size of allocation for tr */ /* Alloc a DP matrix and traceback pointers, two rows each, O(M). * Alloc two O(L) arrays to trace back through the sequence thru B and E. */ mx = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx); tmx = AllocPlan7Matrix(2, hmm->M, &xtr, &mtr, &itr, &dtr); btr = MallocOrDie(sizeof(int) * (L+1)); etr = MallocOrDie(sizeof(int) * (L+1)); /* Initialization of the zero row. */ xmx[0][XMN] = 0; /* S->N, p=1 */ xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */ btr[0] = 0; xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */ etr[0] = -1; for (k = 0; k <= hmm->M; k++) mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */ /* Recursion. Done as a pull. Rolling index trick. Trace ptr propagation trick. * Note some slightly wasteful boundary conditions: * tsc[0] = -INFTY for all eight transitions (no node 0) * D_M and I_M are wastefully calculated (they don't exist) * * Notes on traceback pointer propagation. * - In the path B->E, we propagate the i that B was aligned to in the optimal * alignment, via mtr, dtr, and itr. * - When we reach an E, we record the i of the B it started from in etr. * - In a looping path E->J...->B or terminal path E->C...->T, we propagate * the i that E was aligned to in the optimal alignment via xtr[][XMC] * and xtr[][XMJ]. * - When we enter B, we record the i of the best previous E, or 0 if there * isn't one, in btr. */ for (i = 1; i <= L; i++) { cur = i % 2; prv = !cur; mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY; for (k = 1; k <= hmm->M; k++) { /* match state */ mmx[cur][k] = -INFTY; if ((sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > -INFTY) { mmx[cur][k] = sc; mtr[cur][k] = mtr[prv][k-1]; } if ((sc = imx[prv][k-1] + hmm->tsc[k-1][TIM]) > mmx[cur][k]) { mmx[cur][k] = sc; mtr[cur][k] = itr[prv][k-1]; } if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k]) { mmx[cur][k] = sc; mtr[cur][k] = i-1; } if ((sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k]) { mmx[cur][k] = sc; mtr[cur][k] = dtr[prv][k-1]; } if (hmm->msc[(int) dsq[i]][k] != -INFTY) mmx[cur][k] += hmm->msc[(int) dsq[i]][k]; else mmx[cur][k] = -INFTY; /* delete state */ dmx[cur][k] = -INFTY; if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY) { dmx[cur][k] = sc; dtr[cur][k] = mtr[cur][k-1]; } if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k]) { dmx[cur][k] = sc; dtr[cur][k] = dtr[cur][k-1]; } /* insert state */ if (k < hmm->M) { imx[cur][k] = -INFTY; if ((sc = mmx[prv][k] + hmm->tsc[k][TMI]) > -INFTY) { imx[cur][k] = sc; itr[cur][k] = mtr[prv][k]; } if ((sc = imx[prv][k] + hmm->tsc[k][TII]) > imx[cur][k]) { imx[cur][k] = sc; itr[cur][k] = itr[prv][k]; } if (hmm->isc[(int) dsq[i]][k] != -INFTY) imx[cur][k] += hmm->isc[(int) dsq[i]][k]; else imx[cur][k] = -INFTY; } } /* Now the special states. Order is important here. * remember, C and J emissions are zero score by definition, */ /* N state */ xmx[cur][XMN] = -INFTY; if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY) xmx[cur][XMN] = sc; /* E state */ xmx[cur][XME] = -INFTY; for (k = 1; k <= hmm->M; k++) if ((sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME]) { xmx[cur][XME] = sc; etr[i] = mtr[cur][k]; } /* J state */ xmx[cur][XMJ] = -INFTY; if ((sc = xmx[prv][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY) { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = xtr[prv][XMJ]; } if ((sc = xmx[cur][XME] + hmm->xsc[XTE][LOOP]) > xmx[cur][XMJ]) { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = i; } /* B state */ xmx[cur][XMB] = -INFTY; if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY) { xmx[cur][XMB] = sc; btr[i] = 0; } if ((sc = xmx[cur][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[cur][XMB]) { xmx[cur][XMB] = sc; btr[i] = xtr[cur][XMJ]; } /* C state */ xmx[cur][XMC] = -INFTY; if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY) { xmx[cur][XMC] = sc; xtr[cur][XMC] = xtr[prv][XMC]; } if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC]) { xmx[cur][XMC] = sc; xtr[cur][XMC] = i; } } /* T state (not stored) */ sc = xmx[cur][XMC] + hmm->xsc[XTC][MOVE]; /***************************************************************** * Collapsed traceback stage. * xtr[L%2][XMC] contains the position j of the previous E * etr[j] contains the position i of the previous B * btr[i] contains the position j of the previous E, or 0 * continue until btr[i] = 0. *****************************************************************/ curralloc = 2; /* minimum: no hits */ P7AllocTrace(curralloc, &tr); /* Init of collapsed trace. Back to front; we ReverseTrace() later. */ tpos = 0; tr->statetype[tpos] = STT; tr->pos[tpos] = 0; i = xtr[L%2][XMC]; while (i > 0) { curralloc += 2; P7ReallocTrace(tr, curralloc); tpos++; tr->statetype[tpos] = STE; tr->pos[tpos] = i; i = etr[i]; tpos++; tr->statetype[tpos] = STB; tr->pos[tpos] = i; i = btr[i]; } tpos++; tr->statetype[tpos] = STS; tr->pos[tpos] = 0; tr->tlen = tpos + 1; P7ReverseTrace(tr); FreePlan7Matrix(mx); FreePlan7Matrix(tmx); free(btr); free(etr); *ret_tr = tr; return Scorify(sc);}/* Function: P7WeeViterbi() * Date: SRE, Wed Mar 4 08:24:04 1998 [St. Louis] * * Purpose: Hirschberg/Myers/Miller linear memory alignment. * See [Hirschberg75,MyM-88a] for the idea of the algorithm. * Adapted to HMM implementation. * * Requires that you /know/ that there's only * one hit to the model in the sequence: either * because you're forcing single-hit, or you've * previously called P7ParsingViterbi to parse * the sequence into single-hit segments. The reason * for this is that a cyclic model (a la Plan7) * defeats the nice divide and conquer trick. * (I think some trickery with propagated trace pointers * could get around this but haven't explored it.) * This is implemented by ignoring transitions * to/from J state. * * Args: dsq - sequence in digitized form * L - length of dsq * hmm - the model * ret_tr - RETURN: traceback. * * Returns: Score of the optimal Viterbi alignment. */floatP7WeeViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr){ struct p7trace_s *tr; /* RETURN: traceback */ int *kassign; /* 0..L+1, alignment of seq positions to model nodes */ char *tassign; /* 0..L+1, alignment of seq positions to state types */ int *endlist; /* stack of end points on sequence to work on */ int *startlist; /* stack of start points on sequence to work on */ int lpos; /* position in endlist, startlist */ int k1, k2, k3; /* start, mid, end in model */ char t1, t2, t3; /* start, mid, end in state type */ int s1, s2, s3; /* start, mid, end in sequence */ float sc; /* score of segment optimal alignment */ float ret_sc; /* optimal score over complete seq */ int tlen; /* length needed for trace */ int i, k, tpos; /* index in sequence, model, trace */ /* Initialize. */ kassign = MallocOrDie (sizeof(int) * (L+1)); tassign = MallocOrDie (sizeof(char)* (L+1)); endlist = MallocOrDie (sizeof(int) * (L+1)); startlist = MallocOrDie (sizeof(int) * (L+1)); lpos = 0; startlist[lpos] = 1; endlist[lpos] = L; kassign[1] = 1; kassign[L] = hmm->M; tassign[1] = STS; /* temporary boundary condition! will become N or M */ tassign[L] = STT; /* temporary boundary condition! will become M or C */ /* Recursive divide-and-conquer alignment. */ while (lpos >= 0) { /* Pop a segment off the stack */ s1 = startlist[lpos]; k1 = kassign[s1]; t1 = tassign[s1]; s3 = endlist[lpos]; k3 = kassign[s3]; t3 = tassign[s3]; lpos--; /* find optimal midpoint of segment */ sc = get_wee_midpt(hmm, dsq, L, k1, t1, s1, k3, t3, s3, &k2, &t2, &s2); kassign[s2] = k2; tassign[s2] = t2; /* score is valid on first pass */ if (t1 == STS && t3 == STT) ret_sc = sc; /* push N-terminal segment on stack */ if (t2 != STN && (s2 - s1 > 1 || (s2 - s1 == 1 && t1 == STS))) { lpos++; startlist[lpos] = s1; endlist[lpos] = s2; } /* push C-terminal segment on stack */ if (t2 != STC && (s3 - s2 > 1 || (s3 - s2 == 1 && t3 == STT))) { lpos++; startlist[lpos] = s2;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -