📄 core_algorithms.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 5 页
字号:
    {      sqlen = ctr->pos[i*2+2] - ctr->pos[i*2+1];   /* length of subseq */      if (P7ViterbiSize(sqlen, hmm->M) > RAMLIMIT)	P7WeeViterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i]));      else	P7Viterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i]));      tlen  += tarr[i]->tlen - 4; /* not counting S->N,...,C->T */      totlen += sqlen;    }  /* Step 3. Compose the subtraces into one big final trace.   *         This is wasteful because we're going to TraceDecompose()   *         it again in both hmmsearch and hmmpfam to look at   *         individual domains; but we do it anyway so the P7SmallViterbi   *         interface looks exactly like the P7Viterbi interface. Maybe   *         long traces shouldn't include all the N/J/C states anyway,   *         since they're unambiguously implied.   */  /* Calculate total trace len and alloc;   * nonemitting SNCT + nonemitting J's + emitting NJC   */  tlen += 4 + (ndom-1) + (L-totlen);  P7AllocTrace(tlen, &tr);  tr->tlen = tlen;  /* Add N-terminal trace framework   */  tr->statetype[0] = STS;  tr->nodeidx[0]   = 0;  tr->pos[0]       = 0;  tr->statetype[1] = STN;  tr->nodeidx[1]   = 0;  tr->pos[1]       = 0;  tpos = 2;				/* add implied N's */  for (pos = 1; pos <= ctr->pos[1]; pos++)    {      tr->statetype[tpos] = STN;      tr->nodeidx[tpos]   = 0;      tr->pos[tpos]       = pos;      tpos++;    }  /* Add each subseq trace in, with its appropriate    * sequence offset derived from the collapsed trace   */  for (i = 0; i < ndom; i++)    {				/* skip SN, CT framework at ends */      for (t2 = 2; t2 < tarr[i]->tlen-2; t2++)	{	  tr->statetype[tpos] = tarr[i]->statetype[t2];	  tr->nodeidx[tpos]   = tarr[i]->nodeidx[t2];	  if (tarr[i]->pos[t2] > 0) 	    tr->pos[tpos]       = tarr[i]->pos[t2] + ctr->pos[i*2+1];	  else	    tr->pos[tpos]       = 0;	  tpos++;	}				/* add nonemitting J or C */      tr->statetype[tpos] = (i == ndom-1) ? STC : STJ;      tr->nodeidx[tpos]   = 0;      tr->pos[tpos]       = 0;      tpos++;				/* add implied emitting J's */      if (i != ndom-1)	for (pos = ctr->pos[i*2+2]+1; pos <= ctr->pos[(i+1)*2+1]; pos++)	  {	    tr->statetype[tpos] = STJ;	    tr->nodeidx[tpos]   = 0;	    tr->pos[tpos]       = pos;	    tpos++; 	  }    }				/* add implied C's */  for (pos = ctr->pos[ndom*2]+1; pos <= L; pos++)    {      tr->statetype[tpos] = STC;      tr->nodeidx[tpos]   = 0;      tr->pos[tpos]       = pos;      tpos++;    }				/* add terminal T */  tr->statetype[tpos] = STT;  tr->nodeidx[tpos]   = 0;  tr->pos[tpos]       = 0;  tpos++;	      for (i = 0; i < ndom; i++) P7FreeTrace(tarr[i]);  free(tarr);  P7FreeTrace(ctr);  *ret_tr = tr;  return sc;}/* Function: P7ParsingViterbi() * Date:     SRE, Wed Mar  4 14:07:31 1998 [St. Louis] * * Purpose:  The "hmmfs" linear-memory algorithm for finding *           the optimal alignment of a very long sequence to *           a looping, multihit (e.g. Plan7) model, parsing it into *           a series of nonoverlapping subsequences that match   *           the model once. Other algorithms (e.g. P7Viterbi() *           or P7WeeViterbi()) are applied subsequently to *           these subsequences to recover complete alignments. *            *           The hmmfs algorithm appears briefly in [Durbin98], *           but is otherwise unpublished. *            *           The traceback structure returned is special: a *           "collapsed" trace S->B->E->...->B->E->T, where *           stateidx is unused and pos is used to indicate the *           position of B and E in the sequence. The matched *           subsequence is B_pos+1...E_pos. The number of *           matches in the trace is (tlen/2)-1. * * Args:     dsq    - sequence in digitized form *           L      - length of dsq *           hmm    - the model (log odds scores ready) *           ret_tr - RETURN: a collapsed traceback.       * * Returns:  Score of the optimal Viterbi alignment, in bits. */floatP7ParsingViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr){  struct dpmatrix_s *mx;        /* two rows of score matrix */  struct dpmatrix_s *tmx;       /* two rows of misused score matrix: traceback ptrs */  struct p7trace_s  *tr;        /* RETURN: collapsed traceback */  int  **xmx, **mmx, **dmx, **imx; /* convenience ptrs to score matrix */    int  **xtr, **mtr, **dtr, **itr; /* convenience ptrs to traceback pointers */  int   *btr, *etr;             /* O(L) trace ptrs for B, E state pts in seq */      int    sc;			/* integer score of optimal alignment  */  int    i,k,tpos;		/* index for seq, model, trace position */  int    cur, prv;		/* indices for rolling dp matrix */  int    curralloc;		/* size of allocation for tr */  /* Alloc a DP matrix and traceback pointers, two rows each, O(M).   * Alloc two O(L) arrays to trace back through the sequence thru B and E.   */  mx  = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);  tmx = AllocPlan7Matrix(2, hmm->M, &xtr, &mtr, &itr, &dtr);  btr = MallocOrDie(sizeof(int) * (L+1));  etr = MallocOrDie(sizeof(int) * (L+1));  /* Initialization of the zero row.   */  xmx[0][XMN] = 0;		                     /* S->N, p=1            */  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */  btr[0]      = 0;  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */  etr[0]      = -1;   for (k = 0; k <= hmm->M; k++)    mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;      /* need seq to get here */  /* Recursion. Done as a pull. Rolling index trick. Trace ptr propagation trick.   * Note some slightly wasteful boundary conditions:     *    tsc[0] = -INFTY for all eight transitions (no node 0)   *    D_M and I_M are wastefully calculated (they don't exist)   *       * Notes on traceback pointer propagation.   *  - In the path B->E, we propagate the i that B was aligned to in the optimal   *    alignment, via mtr, dtr, and itr.    *  - When we reach an E, we record the i of the B it started from in etr.   *  - In a looping path E->J...->B or terminal path E->C...->T, we propagate   *    the i that E was aligned to in the optimal alignment via xtr[][XMC]   *    and xtr[][XMJ].   *  - When we enter B, we record the i of the best previous E, or 0 if there   *    isn't one, in btr.   */  for (i = 1; i <= L; i++) {    cur = i % 2;    prv = !cur;        mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;    for (k = 1; k <= hmm->M; k++) {				/* match state */      mmx[cur][k] = -INFTY;      if ((sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > -INFTY)	{ mmx[cur][k] = sc; mtr[cur][k] = mtr[prv][k-1]; }      if ((sc = imx[prv][k-1] + hmm->tsc[k-1][TIM]) > mmx[cur][k])	{ mmx[cur][k] = sc; mtr[cur][k] = itr[prv][k-1]; }      if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])	{ mmx[cur][k] = sc; mtr[cur][k] = i-1; }      if ((sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k])	{ mmx[cur][k] = sc; mtr[cur][k] = dtr[prv][k-1]; }      if (hmm->msc[(int) dsq[i]][k] != -INFTY)	mmx[cur][k] += hmm->msc[(int) dsq[i]][k];      else	mmx[cur][k] = -INFTY;				/* delete state */      dmx[cur][k] = -INFTY;      if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY)	{ dmx[cur][k] = sc; dtr[cur][k] = mtr[cur][k-1]; }      if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])	{ dmx[cur][k] = sc; dtr[cur][k] = dtr[cur][k-1]; }				/* insert state */      if (k < hmm->M) {	imx[cur][k] = -INFTY;	if ((sc = mmx[prv][k] + hmm->tsc[k][TMI]) > -INFTY)	  { imx[cur][k] = sc; itr[cur][k] = mtr[prv][k]; }	if ((sc = imx[prv][k] + hmm->tsc[k][TII]) > imx[cur][k])	  { imx[cur][k] = sc; itr[cur][k] = itr[prv][k]; }	if (hmm->isc[(int) dsq[i]][k] != -INFTY)	  imx[cur][k] += hmm->isc[(int) dsq[i]][k];	else	  imx[cur][k] = -INFTY;      }    }    /* Now the special states. Order is important here.     * remember, C and J emissions are zero score by definition,     */				/* N state */    xmx[cur][XMN] = -INFTY;    if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)      xmx[cur][XMN] = sc;				/* E state */    xmx[cur][XME] = -INFTY;    for (k = 1; k <= hmm->M; k++)      if ((sc =  mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])	{ xmx[cur][XME] = sc; etr[i] = mtr[cur][k]; }				/* J state */    xmx[cur][XMJ] = -INFTY;    if ((sc = xmx[prv][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)      { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = xtr[prv][XMJ]; }    if ((sc = xmx[cur][XME]   + hmm->xsc[XTE][LOOP]) > xmx[cur][XMJ])      { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = i; }				/* B state */    xmx[cur][XMB] = -INFTY;    if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)      { xmx[cur][XMB] = sc; btr[i] = 0; }    if ((sc = xmx[cur][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[cur][XMB])      { xmx[cur][XMB] = sc; btr[i] = xtr[cur][XMJ]; }				/* C state */    xmx[cur][XMC] = -INFTY;    if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)      { xmx[cur][XMC] = sc; xtr[cur][XMC] = xtr[prv][XMC]; }    if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])      { xmx[cur][XMC] = sc; xtr[cur][XMC] = i; }  }				/* T state (not stored) */  sc = xmx[cur][XMC] + hmm->xsc[XTC][MOVE];  /*****************************************************************   * Collapsed traceback stage.    * xtr[L%2][XMC] contains the position j of the previous E   * etr[j]        contains the position i of the previous B   * btr[i]        contains the position j of the previous E, or 0   * continue until btr[i] = 0.   *****************************************************************/  curralloc = 2;		/* minimum: no hits */  P7AllocTrace(curralloc, &tr);  /* Init of collapsed trace. Back to front; we ReverseTrace() later.   */  tpos = 0;  tr->statetype[tpos] = STT;  tr->pos[tpos]       = 0;  i                   = xtr[L%2][XMC];  while (i > 0)    {      curralloc += 2;      P7ReallocTrace(tr, curralloc);      tpos++;      tr->statetype[tpos] = STE;      tr->pos[tpos] = i;      i = etr[i];      tpos++;      tr->statetype[tpos] = STB;      tr->pos[tpos] = i;      i = btr[i];    }  tpos++;  tr->statetype[tpos] = STS;  tr->pos[tpos]       = 0;  tr->tlen = tpos + 1;  P7ReverseTrace(tr);    FreePlan7Matrix(mx);  FreePlan7Matrix(tmx);  free(btr);  free(etr);  *ret_tr = tr;  return Scorify(sc);}/* Function: P7WeeViterbi() * Date:     SRE, Wed Mar  4 08:24:04 1998 [St. Louis] * * Purpose:  Hirschberg/Myers/Miller linear memory alignment. *           See [Hirschberg75,MyM-88a] for the idea of the algorithm. *           Adapted to HMM implementation.  *            *           Requires that you /know/ that there's only *           one hit to the model in the sequence: either *           because you're forcing single-hit, or you've *           previously called P7ParsingViterbi to parse *           the sequence into single-hit segments. The reason *           for this is that a cyclic model (a la Plan7) *           defeats the nice divide and conquer trick. *           (I think some trickery with propagated trace pointers *           could get around this but haven't explored it.) *           This is implemented by ignoring transitions *           to/from J state. * * Args:     dsq    - sequence in digitized form *           L      - length of dsq *           hmm    - the model *           ret_tr - RETURN: traceback. * * Returns:  Score of the optimal Viterbi alignment. */floatP7WeeViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr){  struct p7trace_s *tr;         /* RETURN: traceback */  int          *kassign;        /* 0..L+1, alignment of seq positions to model nodes */  char         *tassign;        /* 0..L+1, alignment of seq positions to state types */  int          *endlist;        /* stack of end points on sequence to work on */  int          *startlist;      /* stack of start points on sequence to work on */  int          lpos;            /* position in endlist, startlist */  int          k1, k2, k3;	/* start, mid, end in model      */  char         t1, t2, t3;	/* start, mid, end in state type */  int          s1, s2, s3;	/* start, mid, end in sequence   */  float        sc;		/* score of segment optimal alignment */  float        ret_sc;		/* optimal score over complete seq */  int          tlen;		/* length needed for trace */  int          i, k, tpos;	/* index in sequence, model, trace */  /* Initialize.   */  kassign   = MallocOrDie (sizeof(int) * (L+1));  tassign   = MallocOrDie (sizeof(char)* (L+1));  endlist   = MallocOrDie (sizeof(int) * (L+1));  startlist = MallocOrDie (sizeof(int) * (L+1));  lpos = 0;  startlist[lpos] = 1;  endlist[lpos]   = L;  kassign[1]      = 1;  kassign[L]      = hmm->M;  tassign[1]      = STS;	/* temporary boundary condition! will become N or M */  tassign[L]      = STT;	/* temporary boundary condition! will become M or C */  /* Recursive divide-and-conquer alignment.   */  while (lpos >= 0)    {				/* Pop a segment off the stack */      s1 = startlist[lpos];      k1 = kassign[s1];      t1 = tassign[s1];      s3 = endlist[lpos];      k3 = kassign[s3];      t3 = tassign[s3];      lpos--;				/* find optimal midpoint of segment */      sc = get_wee_midpt(hmm, dsq, L, k1, t1, s1, k3, t3, s3, &k2, &t2, &s2);      kassign[s2] = k2;      tassign[s2] = t2;                               /* score is valid on first pass */      if (t1 == STS && t3 == STT) ret_sc = sc;				/* push N-terminal segment on stack */      if (t2 != STN && (s2 - s1 > 1 || (s2 - s1 == 1 && t1 == STS)))	{	  lpos++;	  startlist[lpos] = s1;	  endlist[lpos]   = s2;	}				/* push C-terminal segment on stack */      if (t2 != STC && (s3 - s2 > 1 || (s3 - s2 == 1 && t3 == STT)))	{          lpos++;          startlist[lpos] = s2;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -