📄 postprob.c
字号:
/************************************************************ * Copyright (C) 1998 Ian Holmes (ihh@sanger.ac.uk) * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* postprob.c * Author: Ian Holmes (ihh@sanger.ac.uk, Jun 5 1998) * Derived from core_algorithms.c (SRE, Nov 11 1996) * Incorporated SRE, Sat Nov 6 09:07:12 1999 [Cold Spring Harbor] * * RCS $Id: postprob.c,v 1.1 1999/12/04 22:54:17 eddy Exp $ ***************************************************************** * IHH's notes: * * Functions for working with posterior probabilities, * including unfussed "backwards" and "optimal accuracy" * implementations. ***************************************************************** * SRE's notes: * * Simple API example: * struct p7trace_s *tr; * struct dpmatrix_s *fwd; * struct dpmatrix_s *bck; * struct dpmatrix_s *posterior; * char *postcode; * * (get a traceback from somewhere: P7Viterbi() or a modelmaker) * (get an HMM from somewhere: read file or construct it) * P7Forward (dsq, len, hmm, &fwd); * P7Backward(dsq, len, hmm, &bck); * posterior = bck; -- can alloc posterior, but also can re-use bck -- * P7EmitterPosterior(len, hmm, fwd, bck, posterior); * postcode = PostalCode(len, posterior, tr); * * MSAAppendGR(msa, "POST", seqidx, postcode); -- or a similar annotation call -- * * free(postcode); * FreePlan7Matrix(fwd); * FreePlan7Matrix(bck); * * P7OptimalAccuracy() - the Durbin/Holmes optimal accuracy * alignment algorithm. Takes a sequence * and an HMM, returns an alignment as * a trace structure. * * P7Backward() - The Backward() algorithm, counterpart * of P7Forward() in core_algorithms.c. * * P7EmitterPosterior()- The heart of postprob.c: given a Forward * and a Backward matrix, calculate a new matrix * that contains the posterior probabilities * for each symbol i being emitted by * state k (so, \sum_k p(k | x_i) = 1.0). * * P7FillOptimalAccuracy() - The core DP algorithm called by * P7OptimalAccuracy(). * * P7OptimalAccuracyTrace() - the traceback algorithm called by * P7FillOptimalAccuracy(). * * PostalCode() - Create a character string for annotating * an alignment. * * No small memory variants of these algorithms are available * right now. */#include "structs.h"#include "config.h"#include "funcs.h"#include "squid.h"/* Function: P7OptimalAccuracy() * * Purpose: The optimal accuracy dynamic programming algorithm. * Identical to Viterbi() except that posterior residue * label probabilities are used as scores. * * Args: dsq - sequence in digitized form * L - length of dsq * hmm - the model * ret_tr - RETURN: traceback; pass NULL if it's not wanted * * Return: log ( sum_{residues} P(label|M,D) ), as a bit score * (i.e. log of expected accuracy) */floatP7OptimalAccuracy(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr){ double sc; struct dpmatrix_s *forward; struct dpmatrix_s *backward; (void) P7Forward(dsq, L, hmm, &forward); (void) P7Backward(dsq, L, hmm, &backward); P7EmitterPosterior(L, hmm, forward, backward, backward); /* Re-use backward matrix for posterior scores */ sc = P7FillOptimalAccuracy(L, hmm->M, backward, forward, ret_tr); /* Re-use forward matrix for optimal accuracy scores */ FreePlan7Matrix(forward); FreePlan7Matrix(backward); return sc;}/* Function: P7Backward() * * Purpose: The Backward dynamic programming algorithm. * The scaling issue is dealt with by working in log space * and calling ILogsum(); this is a slow but robust approach. * * Args: dsq - sequence in digitized form * L - length of dsq * hmm - the model * ret_mx - RETURN: dp matrix; pass NULL if it's not wanted * * Return: log P(S|M)/P(S|R), as a bit score. */floatP7Backward(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx){ struct dpmatrix_s *mx; int **xmx; int **mmx; int **imx; int **dmx; int i,k; int sc; /* Allocate a DP matrix with 0..L rows, 0..M-1 columns. */ mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx); /* Initialization of the L row. * Note that xmx[i][stS] = xmx[i][stN] by definition for all i, * so stS need not be calculated in backward DP matrices. */ xmx[L][XMC] = hmm->xsc[XTC][MOVE]; /* C<-T */ xmx[L][XME] = xmx[L][XMC] + hmm->xsc[XTE][MOVE]; /* E<-C, no C-tail */ xmx[L][XMJ] = xmx[L][XMB] = xmx[L][XMN] = -INFTY; /* need seq to get out from here */ for (k = hmm->M; k >= 1; k--) { mmx[L][k] = xmx[L][XME] + hmm->esc[k]; /* M<-E ... */ mmx[L][k] += hmm->msc[(int) dsq[L]][k]; /* ... + emitted match symbol */ imx[L][k] = dmx[L][k] = -INFTY; /* need seq to get out from here */ } /* Recursion. Done as a pull. * Note slightly wasteful boundary conditions: * M_M precalculated, D_M set to -INFTY, * D_1 wastefully calculated. * Scores for transitions to D_M also have to be hacked to -INFTY, * as Plan7Logoddsify does not do this for us (I think? - ihh). */ hmm->tsc[hmm->M-1][TDD] = hmm->tsc[hmm->M-1][TMD] = -INFTY; /* no D_M state -- HACK -- should be in Plan7Logoddsify */ for (i = L-1; i >= 0; i--) { /* Do the special states first. * remember, C, N and J emissions are zero score by definition */ xmx[i][XMC] = xmx[i+1][XMC] + hmm->xsc[XTC][LOOP]; xmx[i][XMB] = -INFTY; /* The following section has been hacked to fit a bug in core_algorithms.c * The "correct" code is: * for (k = hmm->M; k >= 1; k--) * xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][k] + hmm->bsc[k]; * * The following code gives the same results as core_algorithms.c: */ xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][hmm->M] + hmm->bsc[hmm->M-1]); for (k = hmm->M-1; k >= 1; k--) xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][k] + hmm->bsc[k]); xmx[i][XMJ] = ILogsum(xmx[i][XMB] + hmm->xsc[XTJ][MOVE], xmx[i+1][XMJ] + hmm->xsc[XTJ][LOOP]); xmx[i][XME] = ILogsum(xmx[i][XMC] + hmm->xsc[XTE][MOVE], xmx[i][XMJ] + hmm->xsc[XTE][LOOP]); xmx[i][XMN] = ILogsum(xmx[i][XMB] + hmm->xsc[XTN][MOVE], xmx[i+1][XMN] + hmm->xsc[XTN][LOOP]); /* Now the main states. Note the boundary conditions at M. */ if (i>0) { mmx[i][hmm->M] = xmx[i][XME] + hmm->esc[hmm->M] + hmm->msc[(int) dsq[i]][hmm->M]; dmx[i][hmm->M] = -INFTY; for (k = hmm->M-1; k >= 1; k--) { mmx[i][k] = ILogsum(ILogsum(xmx[i][XME] + hmm->esc[k], mmx[i+1][k+1] + hmm->tsc[k][TMM]), ILogsum(imx[i+1][k] + hmm->tsc[k][TMI], dmx[i][k+1] + hmm->tsc[k][TMD])); mmx[i][k] += hmm->msc[(int) dsq[i]][k]; imx[i][k] = ILogsum(imx[i+1][k] + hmm->tsc[k][TII], mmx[i+1][k+1] + hmm->tsc[k][TIM]); imx[i][k] += hmm->isc[(int) dsq[i]][k]; dmx[i][k] = ILogsum(dmx[i][k+1] + hmm->tsc[k][TDD], mmx[i+1][k+1] + hmm->tsc[k][TDM]); } } } sc = xmx[0][XMN]; if (ret_mx != NULL) *ret_mx = mx; else FreePlan7Matrix(mx); return Scorify(sc); /* the total Backward score. */}/* Function: P7EmitterPosterior() * * Purpose: Combines Forward and Backward matrices into a posterior * probability matrix. * The entries in row i of this matrix are the logs of the * posterior probabilities of each state emitting symbol i of * the sequence, i.e. all entries for non-emitting states are -INFTY. * The caller must allocate space for the matrix, although the * backward matrix can be used instead (overwriting it will not * compromise the algorithm). * * Args: L - length of sequence * hmm - the model * forward - pre-calculated forward matrix * backward - pre-calculated backward matrix * mx - pre-allocated dynamic programming matrix * * Return: void */voidP7EmitterPosterior(int L, struct plan7_s *hmm, struct dpmatrix_s *forward, struct dpmatrix_s *backward, struct dpmatrix_s *mx){ int i; int k; int sc; sc = backward->xmx[0][XMN]; for (i = L; i >= 1; i--) { mx->xmx[i][XMC] = forward->xmx[i-1][XMC] + hmm->xsc[XTC][LOOP] + backward->xmx[i][XMC] - sc; mx->xmx[i][XMJ] = forward->xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP] + backward->xmx[i][XMJ] - sc; mx->xmx[i][XMN] = forward->xmx[i-1][XMN] + hmm->xsc[XTN][LOOP] + backward->xmx[i][XMN] - sc; mx->xmx[i][XMB] = mx->xmx[i][XME] = -INFTY; for (k = 1; k < hmm->M; k++) { mx->mmx[i][k] = backward->mmx[i][k]; mx->mmx[i][k] += ILogsum(ILogsum(forward->mmx[i-1][k-1] + hmm->tsc[k-1][TMM], forward->imx[i-1][k-1] + hmm->tsc[k-1][TIM]), ILogsum(forward->xmx[i-1][XMB] + hmm->bsc[k], forward->dmx[i-1][k-1] + hmm->tsc[k-1][TDM])); mx->mmx[i][k] -= sc; mx->imx[i][k] = backward->imx[i][k]; mx->imx[i][k] += ILogsum(forward->mmx[i-1][k] + hmm->tsc[k][TMI], forward->imx[i-1][k] + hmm->tsc[k][TII]); mx->imx[i][k] -= sc; mx->dmx[i][k] = -INFTY; } mx->mmx[i][hmm->M] = backward->mmx[i][hmm->M]; mx->mmx[i][hmm->M] += ILogsum(ILogsum(forward->mmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TMM], forward->imx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TIM]), ILogsum(forward->xmx[i-1][XMB] + hmm->bsc[hmm->M], forward->dmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TDM])); mx->mmx[i][hmm->M] -= sc; mx->imx[i][hmm->M] = mx->dmx[i][hmm->M] = mx->dmx[i][0] = -INFTY; }}/* Function: P7FillOptimalAccuracy() * * Purpose: The core of the optimal accuracy dynamic programming algorithm. * Identical to Viterbi() except that scores are given by a * posterior matrix (that the caller must pre-calculate). * Also, the caller must pre-allocate the optimal accuracy matrix * (this allows the forward matrix to be re-used). * P7OptimalAccuracy() does all this for you and cleans up. * * * Args: L - length of sequence * M - length of model * posterior - pre-calculated emitter posterior matrix * mx - pre-allocated dynamic programming matrix * ret_tr - RETURN: traceback; pass NULL if it's not wanted * * Return: log ( sum_{residues} P(label|M,D) ), as a bit score * (i.e. log of expected accuracy) */float P7FillOptimalAccuracy(int L, int M, struct dpmatrix_s *posterior, struct dpmatrix_s *mx, struct p7trace_s **ret_tr){ struct p7trace_s *tr; int **xmx; int **mmx; int **imx; int **dmx; int i,k; int sc; xmx = mx->xmx; mmx = mx->mmx; imx = mx->imx; dmx = mx->dmx; /* Initialization of the zero row. * Each cell in the optimal accuracy matrix holds the log of the expected * of correctly assigned symbols up to that point. * To begin with, everything is log(0) = -INFTY. */ xmx[0][XMN] = xmx[0][XMB] = xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; for (k = 0; k <= M; k++) mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* Recursion. Done as a pull. * Note some slightly wasteful boundary conditions: * D_M and I_M are wastefully calculated (they don't exist) */ for (i = 1; i <= L; i++) { mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY; for (k = 1; k <= M; k++) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -