📄 postprob.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/************************************************************ * Copyright (C) 1998 Ian Holmes (ihh@sanger.ac.uk) * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. ************************************************************//* postprob.c * Author: Ian Holmes (ihh@sanger.ac.uk, Jun 5 1998) * Derived from core_algorithms.c (SRE, Nov 11 1996) * Incorporated SRE, Sat Nov  6 09:07:12 1999 [Cold Spring Harbor] * * RCS $Id: postprob.c,v 1.1 1999/12/04 22:54:17 eddy Exp $  ***************************************************************** * IHH's notes: *  * Functions for working with posterior probabilities, * including unfussed "backwards" and "optimal accuracy" * implementations. ***************************************************************** * SRE's notes: *  * Simple API example: *     struct p7trace_s  *tr; *     struct dpmatrix_s *fwd; *     struct dpmatrix_s *bck; *     struct dpmatrix_s *posterior; *     char *postcode; *      *     (get a traceback from somewhere: P7Viterbi() or a modelmaker) *     (get an HMM from somewhere: read file or construct it) *     P7Forward (dsq, len, hmm, &fwd); *     P7Backward(dsq, len, hmm, &bck); *     posterior = bck;              -- can alloc posterior, but also can re-use bck -- *     P7EmitterPosterior(len, hmm, fwd, bck, posterior); *     postcode = PostalCode(len, posterior, tr); * *     MSAAppendGR(msa, "POST", seqidx, postcode);  -- or a similar annotation call -- *      *     free(postcode); *     FreePlan7Matrix(fwd); *     FreePlan7Matrix(bck); *      * P7OptimalAccuracy() - the Durbin/Holmes optimal accuracy *                       alignment algorithm. Takes a sequence *                       and an HMM, returns an alignment as *                       a trace structure. *  * P7Backward()        - The Backward() algorithm, counterpart *                       of P7Forward() in core_algorithms.c. *                        * P7EmitterPosterior()- The heart of postprob.c: given a Forward *                       and a Backward matrix, calculate a new matrix *                       that contains the posterior probabilities *                       for each symbol i being emitted by  *                       state k (so, \sum_k p(k | x_i) = 1.0). *                        * P7FillOptimalAccuracy() - The core DP algorithm called by  *                       P7OptimalAccuracy(). *                        * P7OptimalAccuracyTrace() - the traceback algorithm called by *                       P7FillOptimalAccuracy().                          *                        * PostalCode()        - Create a character string for annotating *                       an alignment.                          *                        * No small memory variants of these algorithms are available * right now. */#include "structs.h"#include "config.h"#include "funcs.h"#include "squid.h"/* Function: P7OptimalAccuracy() *  * Purpose:  The optimal accuracy dynamic programming algorithm.  *           Identical to Viterbi() except that posterior residue *           label probabilities are used as scores. *            * Args:     dsq    - sequence in digitized form *           L      - length of dsq *           hmm    - the model *           ret_tr - RETURN: traceback; pass NULL if it's not wanted *            * Return:   log ( sum_{residues} P(label|M,D) ), as a bit score *           (i.e. log of expected accuracy) */floatP7OptimalAccuracy(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr){  double sc;  struct dpmatrix_s *forward;  struct dpmatrix_s *backward;  (void) P7Forward(dsq, L, hmm, &forward);  (void) P7Backward(dsq, L, hmm, &backward);  P7EmitterPosterior(L, hmm, forward, backward, backward);           /* Re-use backward matrix for posterior scores */  sc = P7FillOptimalAccuracy(L, hmm->M, backward, forward, ret_tr);  /* Re-use forward matrix for optimal accuracy scores */  FreePlan7Matrix(forward);  FreePlan7Matrix(backward);  return sc;}/* Function: P7Backward() *  * Purpose:  The Backward dynamic programming algorithm. *           The scaling issue is dealt with by working in log space *           and calling ILogsum(); this is a slow but robust approach. *            * Args:     dsq    - sequence in digitized form *           L      - length of dsq *           hmm    - the model *           ret_mx - RETURN: dp matrix; pass NULL if it's not wanted *            * Return:   log P(S|M)/P(S|R), as a bit score. */floatP7Backward(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx){  struct dpmatrix_s *mx;  int **xmx;  int **mmx;  int **imx;  int **dmx;  int   i,k;  int   sc;  /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.   */   mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);  /* Initialization of the L row.   * Note that xmx[i][stS] = xmx[i][stN] by definition for all i,   *    so stS need not be calculated in backward DP matrices.   */  xmx[L][XMC] = hmm->xsc[XTC][MOVE];                 /* C<-T                          */  xmx[L][XME] = xmx[L][XMC] + hmm->xsc[XTE][MOVE];   /* E<-C, no C-tail               */  xmx[L][XMJ] = xmx[L][XMB] = xmx[L][XMN] = -INFTY;  /* need seq to get out from here */  for (k = hmm->M; k >= 1; k--) {    mmx[L][k] = xmx[L][XME] + hmm->esc[k];           /* M<-E ...                      */    mmx[L][k] += hmm->msc[(int) dsq[L]][k];          /* ... + emitted match symbol    */    imx[L][k] = dmx[L][k] = -INFTY;                  /* need seq to get out from here */  }  /* Recursion. Done as a pull.   * Note slightly wasteful boundary conditions:   *    M_M precalculated, D_M set to -INFTY,   *    D_1 wastefully calculated.   * Scores for transitions to D_M also have to be hacked to -INFTY,   * as Plan7Logoddsify does not do this for us (I think? - ihh).   */  hmm->tsc[hmm->M-1][TDD] = hmm->tsc[hmm->M-1][TMD] = -INFTY;    /* no D_M state -- HACK -- should be in Plan7Logoddsify */  for (i = L-1; i >= 0; i--)    {      /* Do the special states first.       * remember, C, N and J emissions are zero score by definition       */      xmx[i][XMC] = xmx[i+1][XMC] + hmm->xsc[XTC][LOOP];            xmx[i][XMB] = -INFTY;      /* The following section has been hacked to fit a bug in core_algorithms.c       * The "correct" code is:       * for (k = hmm->M; k >= 1; k--)       * xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][k] + hmm->bsc[k];       *       * The following code gives the same results as core_algorithms.c:       */      xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][hmm->M] + hmm->bsc[hmm->M-1]);      for (k = hmm->M-1; k >= 1; k--)	xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][k] + hmm->bsc[k]);            xmx[i][XMJ] = ILogsum(xmx[i][XMB] + hmm->xsc[XTJ][MOVE],			    xmx[i+1][XMJ] + hmm->xsc[XTJ][LOOP]);      xmx[i][XME] = ILogsum(xmx[i][XMC] + hmm->xsc[XTE][MOVE],			    xmx[i][XMJ] + hmm->xsc[XTE][LOOP]);      xmx[i][XMN] = ILogsum(xmx[i][XMB] + hmm->xsc[XTN][MOVE],			    xmx[i+1][XMN] + hmm->xsc[XTN][LOOP]);      /* Now the main states. Note the boundary conditions at M.       */      if (i>0) {	mmx[i][hmm->M] = xmx[i][XME] + hmm->esc[hmm->M] + hmm->msc[(int) dsq[i]][hmm->M];	dmx[i][hmm->M] = -INFTY;	for (k = hmm->M-1; k >= 1; k--)	  {	    mmx[i][k]  = ILogsum(ILogsum(xmx[i][XME] + hmm->esc[k],					 mmx[i+1][k+1] + hmm->tsc[k][TMM]),				 ILogsum(imx[i+1][k] + hmm->tsc[k][TMI],					 dmx[i][k+1] + hmm->tsc[k][TMD]));	    mmx[i][k] += hmm->msc[(int) dsq[i]][k];	    	    imx[i][k]  = ILogsum(imx[i+1][k] + hmm->tsc[k][TII],				 mmx[i+1][k+1] + hmm->tsc[k][TIM]);	    imx[i][k] += hmm->isc[(int) dsq[i]][k];	    	    dmx[i][k]  = ILogsum(dmx[i][k+1] + hmm->tsc[k][TDD],				 mmx[i+1][k+1] + hmm->tsc[k][TDM]);	    	  }      }          }  sc = xmx[0][XMN];  if (ret_mx != NULL) *ret_mx = mx;  else                FreePlan7Matrix(mx);  return Scorify(sc);		/* the total Backward score. */}/* Function: P7EmitterPosterior() * * Purpose:  Combines Forward and Backward matrices into a posterior *           probability matrix. *           The entries in row i of this matrix are the logs of the *           posterior probabilities of each state emitting symbol i of *           the sequence, i.e. all entries for non-emitting states are -INFTY. *           The caller must allocate space for the matrix, although the *           backward matrix can be used instead (overwriting it will not *           compromise the algorithm). *            * Args:     L        - length of sequence *           hmm      - the model *           forward  - pre-calculated forward matrix *           backward - pre-calculated backward matrix *           mx       - pre-allocated dynamic programming matrix *            * Return:   void */voidP7EmitterPosterior(int L,		 struct plan7_s *hmm,		 struct dpmatrix_s *forward,		 struct dpmatrix_s *backward,		 struct dpmatrix_s *mx){  int i;  int k;  int sc;  sc = backward->xmx[0][XMN];    for (i = L; i >= 1; i--)    {      mx->xmx[i][XMC] = forward->xmx[i-1][XMC] + hmm->xsc[XTC][LOOP] + backward->xmx[i][XMC] - sc;            mx->xmx[i][XMJ] = forward->xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP] + backward->xmx[i][XMJ] - sc;       mx->xmx[i][XMN] = forward->xmx[i-1][XMN] + hmm->xsc[XTN][LOOP] + backward->xmx[i][XMN] - sc;      mx->xmx[i][XMB] = mx->xmx[i][XME] = -INFTY;            for (k = 1; k < hmm->M; k++) {	mx->mmx[i][k]  = backward->mmx[i][k];	mx->mmx[i][k] += ILogsum(ILogsum(forward->mmx[i-1][k-1] + hmm->tsc[k-1][TMM],					     forward->imx[i-1][k-1] + hmm->tsc[k-1][TIM]),				     ILogsum(forward->xmx[i-1][XMB] + hmm->bsc[k],					     forward->dmx[i-1][k-1] + hmm->tsc[k-1][TDM]));	mx->mmx[i][k] -= sc;		mx->imx[i][k]  = backward->imx[i][k];	mx->imx[i][k] += ILogsum(forward->mmx[i-1][k] + hmm->tsc[k][TMI],				     forward->imx[i-1][k] + hmm->tsc[k][TII]);	mx->imx[i][k] -= sc;		mx->dmx[i][k] = -INFTY;      }      mx->mmx[i][hmm->M]  = backward->mmx[i][hmm->M];      mx->mmx[i][hmm->M] += ILogsum(ILogsum(forward->mmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TMM],					   forward->imx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TIM]),				   ILogsum(forward->xmx[i-1][XMB] + hmm->bsc[hmm->M],					   forward->dmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TDM]));      mx->mmx[i][hmm->M] -= sc;      mx->imx[i][hmm->M] = mx->dmx[i][hmm->M] = mx->dmx[i][0] = -INFTY;          }}/* Function: P7FillOptimalAccuracy() *  * Purpose:  The core of the optimal accuracy dynamic programming algorithm.  *           Identical to Viterbi() except that scores are given by a *           posterior matrix (that the caller must pre-calculate). *           Also, the caller must pre-allocate the optimal accuracy matrix *           (this allows the forward matrix to be re-used). *           P7OptimalAccuracy() does all this for you and cleans up. *   *            * Args:     L         - length of sequence *           M         - length of model *           posterior - pre-calculated emitter posterior matrix *           mx        - pre-allocated dynamic programming matrix *           ret_tr    - RETURN: traceback; pass NULL if it's not wanted *            * Return:   log ( sum_{residues} P(label|M,D) ), as a bit score *           (i.e. log of expected accuracy) */float P7FillOptimalAccuracy(int L,			    int M,			    struct dpmatrix_s *posterior,			    struct dpmatrix_s *mx,			    struct p7trace_s **ret_tr){  struct p7trace_s  *tr;  int **xmx;  int **mmx;  int **imx;  int **dmx;  int   i,k;  int   sc;  xmx = mx->xmx;  mmx = mx->mmx;  imx = mx->imx;  dmx = mx->dmx;  /* Initialization of the zero row.   * Each cell in the optimal accuracy matrix holds the log of the expected   * of correctly assigned symbols up to that point.   * To begin with, everything is log(0) = -INFTY.   */  xmx[0][XMN] = xmx[0][XMB] = xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  for (k = 0; k <= M; k++)    mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;  /* Recursion. Done as a pull.   * Note some slightly wasteful boundary conditions:     *    D_M and I_M are wastefully calculated (they don't exist)   */  for (i = 1; i <= L; i++)    {      mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;            for (k = 1; k <= M; k++)	{
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -