core_algorithms.c

来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,965 行 · 第 1/5 页

C
1,965
字号
 *           We don't (yet) worry about overflow issues like we did with *           P7ViterbiSize(). We'll have many other 32-bit int issues in the *           code if we overflow here. * * Args:     L - length of sequence *           M - length of HMM    *                * Returns:  # of MB */intP7SmallViterbiSize(int L, int M){  return ((2 * sizeof(struct dpmatrix_s) +	   12 * (M+2) * sizeof(int)      + /* 2 matrices w/ 2 rows */ 	   16 * sizeof(int *)            + /* ptrs into rows of matrix */	   20 * sizeof(int)              + /* 5 special states     */	   2  * (L+1) * sizeof(int))       /* traceback indices    */      	  / 1000000);}/* Function: P7WeeViterbiSize() * Date:     SRE, Fri Mar  6 15:40:42 1998 [St. Louis] * * Purpose:  Returns the ballpark predicted memory requirement for *           a P7WeeViterbi() alignment, in MB.  * * Args:     L - length of sequence *           M - length of HMM    *                * Returns:  # of MB  */intP7WeeViterbiSize(int L, int M){  return ((2 * sizeof(struct dpmatrix_s) +	   12 * (M+2) * sizeof(int)      + /* 2 matrices w/ 2 rows */ 	   16 * sizeof(int *)            + /* ptrs into rows of matrix */	   20 * sizeof(int)              + /* 5 special states      */	   2  * (L+2) * sizeof(int) +      /* stacks for starts/ends (overkill) */      	   (L+2) * sizeof(int) +           /* k assignments to seq positions */	   (L+2) * sizeof(char))           /* state assignments to seq pos */	  / 1000000);}/* Function: P7Forward() *  * Purpose:  The Forward dynamic programming algorithm. *           The scaling issue is dealt with by working in log space *           and calling ILogsum(); this is a slow but robust approach. *            * Args:     dsq    - sequence in digitized form *           L      - length of dsq *           hmm    - the model *           ret_mx - RETURN: dp matrix; pass NULL if it's not wanted *            * Return:   log P(S|M)/P(S|R), as a bit score. */floatP7Forward(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx){  struct dpmatrix_s *mx;  int **xmx;  int **mmx;  int **imx;  int **dmx;  int   i,k;  int   sc;  /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.   */   mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);  /* Initialization of the zero row.   * Note that xmx[i][stN] = 0 by definition for all i,   *    and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need   *    to be calculated in DP matrices.   */  xmx[0][XMN] = 0;		                     /* S->N, p=1            */  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */  for (k = 0; k <= hmm->M; k++)    mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;      /* need seq to get here */  /* Recursion. Done as a pull.   * Note some slightly wasteful boundary conditions:     *    tsc[0] = -INFTY for all eight transitions (no node 0)   */  for (i = 1; i <= L; i++)    {      mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;      for (k = 1; k < hmm->M; k++)	{	  mmx[i][k]  = ILogsum(ILogsum(mmx[i-1][k-1] + hmm->tsc[TMM][k-1],				     imx[i-1][k-1] + hmm->tsc[TIM][k-1]),			      ILogsum(xmx[i-1][XMB] + hmm->bsc[k],				     dmx[i-1][k-1] + hmm->tsc[TDM][k-1]));	  mmx[i][k] += hmm->msc[dsq[i]][k];	  dmx[i][k]  = ILogsum(mmx[i][k-1] + hmm->tsc[TMD][k-1],			      dmx[i][k-1] + hmm->tsc[TDD][k-1]);	  imx[i][k]  = ILogsum(mmx[i-1][k] + hmm->tsc[TMI][k],			      imx[i-1][k] + hmm->tsc[TII][k]);	  imx[i][k] += hmm->isc[dsq[i]][k];	}      mmx[i][hmm->M] = ILogsum(ILogsum(mmx[i-1][hmm->M-1] + hmm->tsc[TMM][hmm->M-1],				   imx[i-1][hmm->M-1] + hmm->tsc[TIM][hmm->M-1]),			       ILogsum(xmx[i-1][XMB] + hmm->bsc[hmm->M],				   dmx[i-1][hmm->M-1] + hmm->tsc[TDM][hmm->M-1]));      mmx[i][hmm->M] += hmm->msc[dsq[i]][hmm->M];      /* Now the special states.       * remember, C and J emissions are zero score by definition       */      xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP];      xmx[i][XME] = -INFTY;      for (k = 1; k <= hmm->M; k++)	xmx[i][XME] = ILogsum(xmx[i][XME], mmx[i][k] + hmm->esc[k]);      xmx[i][XMJ] = ILogsum(xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP],			   xmx[i][XME]   + hmm->xsc[XTE][LOOP]);      xmx[i][XMB] = ILogsum(xmx[i][XMN] + hmm->xsc[XTN][MOVE],			    xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]);      xmx[i][XMC] = ILogsum(xmx[i-1][XMC] + hmm->xsc[XTC][LOOP],			    xmx[i][XME] + hmm->xsc[XTE][MOVE]);    }			      sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];  if (ret_mx != NULL) *ret_mx = mx;  else                FreePlan7Matrix(mx);  return Scorify(sc);		/* the total Forward score. */}      #ifdef SLOW/* Function: P7Viterbi() *  * Purpose:  The Viterbi dynamic programming algorithm.  *           Identical to Forward() except that max's *           replace sum's.  *            *           This is the slower, more understandable version *           of P7Viterbi(). The default version in fast_algorithms.c *           is portably optimized and more difficult to understand; *           the ALTIVEC version in fast_algorithms.c is vectorized *           with Altivec-specific code, and is pretty opaque. *            *           This function is not enabled by default; it is only *           activated by -DSLOW at compile time. *            * Args:     dsq    - sequence in digitized form *           L      - length of dsq *           hmm    - the model *           mx     - reused DP matrix *           ret_tr - RETURN: traceback; pass NULL if it's not wanted *            * Return:   log P(S|M)/P(S|R), as a bit score */floatP7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr){  struct p7trace_s  *tr;  int **xmx;  int **mmx;  int **imx;  int **dmx;  int   i,k;  int   sc;  /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.   */   ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);  /* Initialization of the zero row.   */  xmx[0][XMN] = 0;		                     /* S->N, p=1            */  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */  for (k = 0; k <= hmm->M; k++)    mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;      /* need seq to get here */  /* Recursion. Done as a pull.   * Note some slightly wasteful boundary conditions:     *    tsc[0] = -INFTY for all eight transitions (no node 0)   *    D_M and I_M are wastefully calculated (they don't exist)   */  for (i = 1; i <= L; i++) {    mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;    for (k = 1; k <= hmm->M; k++) {				/* match state */      mmx[i][k]  = -INFTY;      if ((sc = mmx[i-1][k-1] + hmm->tsc[TMM][k-1]) > mmx[i][k])	mmx[i][k] = sc;      if ((sc = imx[i-1][k-1] + hmm->tsc[TIM][k-1]) > mmx[i][k])	mmx[i][k] = sc;      if ((sc = xmx[i-1][XMB] + hmm->bsc[k]) > mmx[i][k])	mmx[i][k] = sc;      if ((sc = dmx[i-1][k-1] + hmm->tsc[TDM][k-1]) > mmx[i][k])	mmx[i][k] = sc;      if (hmm->msc[dsq[i]][k] != -INFTY) mmx[i][k] += hmm->msc[dsq[i]][k];      else                                     mmx[i][k] = -INFTY;				/* delete state */      dmx[i][k] = -INFTY;      if ((sc = mmx[i][k-1] + hmm->tsc[TMD][k-1]) > dmx[i][k])	dmx[i][k] = sc;      if ((sc = dmx[i][k-1] + hmm->tsc[TDD][k-1]) > dmx[i][k])	dmx[i][k] = sc;				/* insert state */      if (k < hmm->M) {	imx[i][k] = -INFTY;	if ((sc = mmx[i-1][k] + hmm->tsc[TMI][k]) > imx[i][k])	  imx[i][k] = sc;	if ((sc = imx[i-1][k] + hmm->tsc[TII][k]) > imx[i][k])	  imx[i][k] = sc;	if (hmm->isc[dsq[i]][k] != -INFTY) imx[i][k] += hmm->isc[dsq[i]][k];	else                                    imx[i][k] = -INFTY;         }    }    /* Now the special states. Order is important here.     * remember, C and J emissions are zero score by definition,     */				/* N state */    xmx[i][XMN] = -INFTY;    if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)      xmx[i][XMN] = sc;				/* E state */    xmx[i][XME] = -INFTY;    for (k = 1; k <= hmm->M; k++)      if ((sc =  mmx[i][k] + hmm->esc[k]) > xmx[i][XME])	xmx[i][XME] = sc;				/* J state */    xmx[i][XMJ] = -INFTY;    if ((sc = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)      xmx[i][XMJ] = sc;    if ((sc = xmx[i][XME]   + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])      xmx[i][XMJ] = sc;				/* B state */    xmx[i][XMB] = -INFTY;    if ((sc = xmx[i][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)      xmx[i][XMB] = sc;    if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])      xmx[i][XMB] = sc;				/* C state */    xmx[i][XMC] = -INFTY;    if ((sc = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)      xmx[i][XMC] = sc;    if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])      xmx[i][XMC] = sc;  }				/* T state (not stored) */  sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];  if (ret_tr != NULL) {    P7ViterbiTrace(hmm, dsq, L, mx, &tr);    *ret_tr = tr;  }  return Scorify(sc);		/* the total Viterbi score. */}#endif /*SLOW*//* Function: P7ViterbiTrace() * Date:     SRE, Sat Aug 23 10:30:11 1997 (St. Louis Lambert Field)  *  * Purpose:  Traceback of a Viterbi matrix: i.e. retrieval  *           of optimum alignment. *            * Args:     hmm    - hmm, log odds form, used to make mx *           dsq    - sequence aligned to (digital form) 1..N   *           N      - length of seq *           mx     - the matrix to trace back in, N x hmm->M *           ret_tr - RETURN: traceback. *            * Return:   (void) *           ret_tr is allocated here. Free using P7FreeTrace(). */voidP7ViterbiTrace(struct plan7_s *hmm, unsigned char *dsq, int N,	       struct dpmatrix_s *mx, struct p7trace_s **ret_tr){  struct p7trace_s *tr;  int curralloc;		/* current allocated length of trace */  int tpos;			/* position in trace */  int i;			/* position in seq (1..N) */  int k;			/* position in model (1..M) */  int **xmx, **mmx, **imx, **dmx;  int sc;			/* temp var for pre-emission score */  /* Overallocate for the trace.   * S-N-B- ... - E-C-T  : 6 states + N is minimum trace;   * add N more as buffer.                */  curralloc = N * 2 + 6;   P7AllocTrace(curralloc, &tr);  xmx = mx->xmx;  mmx = mx->mmx;  imx = mx->imx;  dmx = mx->dmx;  /* Initialization of trace   * We do it back to front; ReverseTrace() is called later.   */  tr->statetype[0] = STT;  tr->nodeidx[0]   = 0;  tr->pos[0]       = 0;  tr->statetype[1] = STC;  tr->nodeidx[1]   = 0;  tr->pos[1]       = 0;  tpos = 2;  i    = N;			/* current i (seq pos) we're trying to assign */  /* Traceback   */  while (tr->statetype[tpos-1] != STS) {    switch (tr->statetype[tpos-1]) {    case STM:			/* M connects from i-1,k-1, or B */      sc = mmx[i+1][k+1] - hmm->msc[dsq[i+1]][k+1];      if (sc <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }      else if (sc == xmx[i][XMB] + hmm->bsc[k+1])	{				/* Check for wing unfolding */	  if (Prob2Score(hmm->begin[k+1], hmm->p1) + 1 * INTSCALE <= hmm->bsc[k+1])	    while (k > 0)	      {		tr->statetype[tpos] = STD;		tr->nodeidx[tpos]   = k--;		tr->pos[tpos]       = 0;		tpos++;		if (tpos == curralloc) 		  {				/* grow trace if necessary  */		    curralloc += N;		    P7ReallocTrace(tr, curralloc);		  }	      }	  tr->statetype[tpos] = STB;	  tr->nodeidx[tpos]   = 0;	  tr->pos[tpos]       = 0;	}      else if (sc == mmx[i][k] + hmm->tsc[TMM][k])	{	  tr->statetype[tpos] = STM;	  tr->nodeidx[tpos]   = k--;	  tr->pos[tpos]       = i--;	}      else if (sc == imx[i][k] + hmm->tsc[TIM][k])	{	  tr->statetype[tpos] = STI;	  tr->nodeidx[tpos]   = k;	  tr->pos[tpos]       = i--;	}      else if (sc == dmx[i][k] + hmm->tsc[TDM][k])	{	  tr->statetype[tpos] = STD;	  tr->nodeidx[tpos]   = k--;	  tr->pos[tpos]       = 0;	}      else	Die("traceback failed");      break;    case STD:			/* D connects from M,D */      if (dmx[i][k+1] <= -INFTY) { P7FreeTrace(tr); *ret_tr = NULL; return; }      else if (dmx[i][k+1] == mmx[i][k] + hmm->tsc[TMD][k])	{	  tr->statetype[tpos] = STM;	  tr->nodeidx[tpos]   = k--;	  tr->pos[tpos]       = i--;	}      else if (dmx[i][k+1] == dmx[i][k] + hmm->tsc[TDD][k]) 	{	  tr->statetype[tpos] = STD;	  tr->nodeidx[tpos]   = k--;	  tr->pos[tpos]       = 0;	}      else Die("traceback failed");      break;    case STI:			/* I connects from M,I */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?