core_algorithms.c

来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,965 行 · 第 1/5 页

C
1,965
字号
      sc = imx[i+1][k] - hmm->isc[dsq[i+1]][k];      if (sc <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }      else if (sc == mmx[i][k] + hmm->tsc[TMI][k])	{	  tr->statetype[tpos] = STM;	  tr->nodeidx[tpos]   = k--;	  tr->pos[tpos]       = i--;	}      else if (sc == imx[i][k] + hmm->tsc[TII][k])	{	  tr->statetype[tpos] = STI;	  tr->nodeidx[tpos]   = k;	  tr->pos[tpos]       = i--;	}      else Die("traceback failed");      break;    case STN:			/* N connects from S, N */      if (i == 0 && xmx[i][XMN] == 0)	{	  tr->statetype[tpos] = STS;	  tr->nodeidx[tpos]   = 0;	  tr->pos[tpos]       = 0;	}      else if (i > 0 && xmx[i+1][XMN] == xmx[i][XMN] + hmm->xsc[XTN][LOOP])	{	  tr->statetype[tpos] = STN;	  tr->nodeidx[tpos]   = 0;	  tr->pos[tpos]       = 0;    /* note convention adherence:  */	  tr->pos[tpos-1]     = i--;  /* first N doesn't emit        */	}      else Die("traceback failed");      break;    case STB:			/* B connects from N, J */      if (xmx[i][XMB] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }      else if (xmx[i][XMB] == xmx[i][XMN] + hmm->xsc[XTN][MOVE])	{	  tr->statetype[tpos] = STN;	  tr->nodeidx[tpos]   = 0;	  tr->pos[tpos]       = 0;	}      else if (xmx[i][XMB] == xmx[i][XMJ] + hmm->xsc[XTJ][MOVE])	{	  tr->statetype[tpos] = STJ;	  tr->nodeidx[tpos]   = 0;	  tr->pos[tpos]       = 0;	}      else Die("traceback failed");      break;    case STE:			/* E connects from any M state. k set here */      if (xmx[i][XME] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }      for (k = hmm->M; k >= 1; k--)	if (xmx[i][XME] == mmx[i][k] + hmm->esc[k])	  {				/* check for wing unfolding */	    if (Prob2Score(hmm->end[k], 1.) + 1*INTSCALE <=  hmm->esc[k])	      {		int dk;		/* need a tmp k while moving thru delete wing */		for (dk = hmm->M; dk > k; dk--)		  {		    tr->statetype[tpos] = STD;		    tr->nodeidx[tpos]   = dk;		    tr->pos[tpos]       = 0;		    tpos++;		    if (tpos == curralloc) 		      {				/* grow trace if necessary  */			curralloc += N;			P7ReallocTrace(tr, curralloc);		      }		  }	      }	    tr->statetype[tpos] = STM;	    tr->nodeidx[tpos]   = k--;	    tr->pos[tpos]       = i--;	    break;	  }      if (k < 0) Die("traceback failed");      break;    case STC:			/* C comes from C, E */      if (xmx[i][XMC] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }      else if (xmx[i][XMC] == xmx[i-1][XMC] + hmm->xsc[XTC][LOOP])	{	  tr->statetype[tpos] = STC;	  tr->nodeidx[tpos]   = 0;	  tr->pos[tpos]       = 0;    /* note convention adherence: */	  tr->pos[tpos-1]     = i--;  /* first C doesn't emit       */	}      else if (xmx[i][XMC] == xmx[i][XME] + hmm->xsc[XTE][MOVE])	{	  tr->statetype[tpos] = STE;	  tr->nodeidx[tpos]   = 0;	  tr->pos[tpos]       = 0; /* E is a nonemitter */	}            else Die("Traceback failed.");      break;    case STJ:			/* J connects from E, J */      if (xmx[i][XMJ] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }      else if (xmx[i][XMJ] == xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP])	{	  tr->statetype[tpos] = STJ;	  tr->nodeidx[tpos]   = 0;	  tr->pos[tpos]       = 0;    /* note convention adherence: */	  tr->pos[tpos-1]     = i--;  /* first J doesn't emit       */	}      else if (xmx[i][XMJ] == xmx[i][XME] + hmm->xsc[XTE][LOOP])	{	  tr->statetype[tpos] = STE;	  tr->nodeidx[tpos]   = 0;	  tr->pos[tpos]       = 0; /* E is a nonemitter */	}      else Die("Traceback failed.");      break;    default:      Die("traceback failed");    } /* end switch over statetype[tpos-1] */        tpos++;    if (tpos == curralloc)       {				/* grow trace if necessary  */	curralloc += N;	P7ReallocTrace(tr, curralloc);      }  } /* end traceback, at S state; tpos == tlen now */  tr->tlen = tpos;  P7ReverseTrace(tr);  *ret_tr = tr;}/* Function: P7SmallViterbi() * Date:     SRE, Fri Mar  6 15:29:41 1998 [St. Louis] * * Purpose:  Wrapper function, for linear memory alignment *           with same arguments as P7Viterbi().  *            *           Calls P7ParsingViterbi to break the sequence *           into fragments. Then, based on size of fragments, *           calls either P7Viterbi() or P7WeeViterbi() to  *           get traces for them. Finally, assembles all these *           traces together to produce an overall optimal *           trace for the sequence. *            *           If the trace isn't needed for some reason, *           all we do is call P7ParsingViterbi. * * Args:     dsq    - sequence in digitized form *           L      - length of dsq *           hmm    - the model *           mx     - DP matrix, growable *           ret_tr - RETURN: traceback; pass NULL if it's not wanted * * Returns:  Score of optimal alignment in bits. */floatP7SmallViterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr){  struct p7trace_s *ctr;        /* collapsed trace of optimal parse */  struct p7trace_s *tr;         /* full trace of optimal alignment */  struct p7trace_s **tarr;      /* trace array */     int   ndom;			/* number of subsequences */  int   i;			/* counter over domains   */  int   pos;			/* position in sequence */  int   tpos;			/* position in trace */  int   tlen;			/* length of full trace   */  int   sqlen;			/* length of a subsequence */  int   totlen;                 /* length of L matched by model (as opposed to N/C/J) */  float sc;			/* score of optimal alignment */  int   t2;			/* position in a subtrace */    /* Step 1. Call P7ParsingViterbi to calculate an optimal parse   *         of the sequence into single-hit subsequences; this parse   *         is returned in a "collapsed" trace   */  sc = P7ParsingViterbi(dsq, L, hmm, &ctr);  /* If we don't want full trace, we're done;   * also, if parsing viterbi returned a NULL trace we're done. */  if (ctr == NULL || ret_tr == NULL)    {      P7FreeTrace(ctr);      return sc;    }    /* Step 2. Call either P7Viterbi or P7WeeViterbi on each subsequence   *         to recover a full traceback of each, collecting them in    *         an array.    */  ndom = ctr->tlen/2 - 1;  tarr = MallocOrDie(sizeof(struct p7trace_s *) * ndom);  tlen = totlen = 0;  for (i = 0; i < ndom; i++)    {      sqlen = ctr->pos[i*2+2] - ctr->pos[i*2+1];   /* length of subseq */      if (P7ViterbiSpaceOK(sqlen, hmm->M, mx))	{	  SQD_DPRINTF1(("      -- using P7Viterbi on an %dx%d subproblem\n",			hmm->M, sqlen));	  P7Viterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, mx, &(tarr[i]));	}      else	{	  SQD_DPRINTF1(("      -- using P7WeeViterbi on an %dx%d subproblem\n",			hmm->M, sqlen));	  P7WeeViterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i]));	}      tlen  += tarr[i]->tlen - 4; /* not counting S->N,...,C->T */      totlen += sqlen;    }  /* Step 3. Compose the subtraces into one big final trace.   *         This is wasteful because we're going to TraceDecompose()   *         it again in both hmmsearch and hmmpfam to look at   *         individual domains; but we do it anyway so the P7SmallViterbi   *         interface looks exactly like the P7Viterbi interface. Maybe   *         long traces shouldn't include all the N/J/C states anyway,   *         since they're unambiguously implied.   */  /* Calculate total trace len and alloc;   * nonemitting SNCT + nonemitting J's + emitting NJC   */  tlen += 4 + (ndom-1) + (L-totlen);  P7AllocTrace(tlen, &tr);  tr->tlen = tlen;  /* Add N-terminal trace framework   */  tr->statetype[0] = STS;  tr->nodeidx[0]   = 0;  tr->pos[0]       = 0;  tr->statetype[1] = STN;  tr->nodeidx[1]   = 0;  tr->pos[1]       = 0;  tpos = 2;				/* add implied N's */  for (pos = 1; pos <= ctr->pos[1]; pos++)    {      tr->statetype[tpos] = STN;      tr->nodeidx[tpos]   = 0;      tr->pos[tpos]       = pos;      tpos++;    }  /* Add each subseq trace in, with its appropriate    * sequence offset derived from the collapsed trace   */  for (i = 0; i < ndom; i++)    {				/* skip SN, CT framework at ends */      for (t2 = 2; t2 < tarr[i]->tlen-2; t2++)	{	  tr->statetype[tpos] = tarr[i]->statetype[t2];	  tr->nodeidx[tpos]   = tarr[i]->nodeidx[t2];	  if (tarr[i]->pos[t2] > 0) 	    tr->pos[tpos]       = tarr[i]->pos[t2] + ctr->pos[i*2+1];	  else	    tr->pos[tpos]       = 0;	  tpos++;	}				/* add nonemitting J or C */      tr->statetype[tpos] = (i == ndom-1) ? STC : STJ;      tr->nodeidx[tpos]   = 0;      tr->pos[tpos]       = 0;      tpos++;				/* add implied emitting J's */      if (i != ndom-1)	for (pos = ctr->pos[i*2+2]+1; pos <= ctr->pos[(i+1)*2+1]; pos++)	  {	    tr->statetype[tpos] = STJ;	    tr->nodeidx[tpos]   = 0;	    tr->pos[tpos]       = pos;	    tpos++; 	  }    }				/* add implied C's */  for (pos = ctr->pos[ndom*2]+1; pos <= L; pos++)    {      tr->statetype[tpos] = STC;      tr->nodeidx[tpos]   = 0;      tr->pos[tpos]       = pos;      tpos++;    }				/* add terminal T */  tr->statetype[tpos] = STT;  tr->nodeidx[tpos]   = 0;  tr->pos[tpos]       = 0;  tpos++;	      for (i = 0; i < ndom; i++) P7FreeTrace(tarr[i]);  free(tarr);  P7FreeTrace(ctr);  *ret_tr = tr;  return sc;}/* Function: P7ParsingViterbi() * Date:     SRE, Wed Mar  4 14:07:31 1998 [St. Louis] * * Purpose:  The "hmmfs" linear-memory algorithm for finding *           the optimal alignment of a very long sequence to *           a looping, multihit (e.g. Plan7) model, parsing it into *           a series of nonoverlapping subsequences that match   *           the model once. Other algorithms (e.g. P7Viterbi() *           or P7WeeViterbi()) are applied subsequently to *           these subsequences to recover complete alignments. *            *           The hmmfs algorithm appears briefly in [Durbin98], *           but is otherwise unpublished. *            *           The traceback structure returned is special: a *           "collapsed" trace S->B->E->...->B->E->T, where *           stateidx is unused and pos is used to indicate the *           position of B and E in the sequence. The matched *           subsequence is B_pos+1...E_pos. The number of *           matches in the trace is (tlen/2)-1. * * Args:     dsq    - sequence in digitized form *           L      - length of dsq *           hmm    - the model (log odds scores ready) *           ret_tr - RETURN: a collapsed traceback.       * * Returns:  Score of the optimal Viterbi alignment, in bits. */floatP7ParsingViterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr){  struct dpmatrix_s *mx;        /* two rows of score matrix */  struct dpmatrix_s *tmx;       /* two rows of misused score matrix: traceback ptrs */  struct p7trace_s  *tr;        /* RETURN: collapsed traceback */  int  **xmx, **mmx, **dmx, **imx; /* convenience ptrs to score matrix */    int  **xtr, **mtr, **dtr, **itr; /* convenience ptrs to traceback pointers */  int   *btr, *etr;             /* O(L) trace ptrs for B, E state pts in seq */      int    sc;			/* integer score of optimal alignment  */  int    i,k,tpos;		/* index for seq, model, trace position */  int    cur, prv;		/* indices for rolling dp matrix */  int    curralloc;		/* size of allocation for tr */  /* Alloc a DP matrix and traceback pointers, two rows each, O(M).   * Alloc two O(L) arrays to trace back through the sequence thru B and E.   */  mx  = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);  tmx = AllocPlan7Matrix(2, hmm->M, &xtr, &mtr, &itr, &dtr);  btr = MallocOrDie(sizeof(int) * (L+1));  etr = MallocOrDie(sizeof(int) * (L+1));  /* Initialization of the zero row.   */  xmx[0][XMN] = 0;		                     /* S->N, p=1            */  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */  btr[0]      = 0;  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */  etr[0]      = -1;   for (k = 0; k <= hmm->M; k++)    mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;      /* need seq to get here */  /* Recursion. Done as a pull. Rolling index trick. Trace ptr propagation trick.   * Note some slightly wasteful boundary conditions:     *    tsc[0] = -INFTY for all eight transitions (no node 0)   *    D_M and I_M are wastefully calculated (they don't exist)   *       * Notes on traceback pointer propagation.   *  - In the path B->E, we propagate the i that B was aligned to in the optimal   *    alignment, via mtr, dtr, and itr.    *  - When we reach an E, we record the i of the B it started from in etr.   *  - In a looping path E->J...->B or terminal path E->C...->T, we propagate   *    the i that E was aligned to in the optimal alignment via xtr[][XMC]   *    and xtr[][XMJ].   *  - When we enter B, we record the i of the best previous E, or 0 if there   *    isn't one, in btr.   */  for (i = 1; i <= L; i++) {    cur = i % 2;    prv = !cur;        mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?