📄 core_algorithms.c
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* core_algorithms.c * SRE, Mon Nov 11 15:58:52 1996 * CVS $Id: core_algorithms.c,v 1.21 2001/05/14 17:58:44 eddy Exp $ * * Simple and robust "research" implementations of Forward, Backward, * and Viterbi for Plan7. */#include "structs.h"#include "config.h"#include "funcs.h"#include "squid.h"#include <string.h>#include <assert.h>static float get_wee_midpt(struct plan7_s *hmm, char *dsq, int L, int k1, char t1, int s1, int k3, char t3, int s3, int *ret_k2, char *ret_t2, int *ret_s2);/* Function: AllocPlan7Matrix() * * Purpose: Allocate a dynamic programming matrix for standard Forward, * Backward, or Viterbi, with scores kept as scaled log-odds * integers. Keeps 2D arrays compact in RAM in an attempt * to maximize cache hits. Sets up individual ptrs to the * four matrix components as a convenience. * * Args: rows - number of rows to allocate; typically L+1 * M - size of model * xmx, mmx, imx, dmx * - RETURN: ptrs to four mx components as a convenience * * Return: mx * mx is allocated here. Caller frees with FreeDPMatrix(mx). */struct dpmatrix_s *AllocPlan7Matrix(int rows, int M, int ***xmx, int ***mmx, int ***imx, int ***dmx){ struct dpmatrix_s *mx; int i; mx = (struct dpmatrix_s *) MallocOrDie (sizeof(struct dpmatrix_s)); mx->xmx = (int **) MallocOrDie (sizeof(int *) * rows); mx->mmx = (int **) MallocOrDie (sizeof(int *) * rows); mx->imx = (int **) MallocOrDie (sizeof(int *) * rows); mx->dmx = (int **) MallocOrDie (sizeof(int *) * rows); mx->xmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*5)); mx->mmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2))); mx->imx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2))); mx->dmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2))); for (i = 1; i < rows; i++) { mx->xmx[i] = mx->xmx[0] + (i*5); mx->mmx[i] = mx->mmx[0] + (i*(M+2)); mx->imx[i] = mx->imx[0] + (i*(M+2)); mx->dmx[i] = mx->dmx[0] + (i*(M+2)); } if (xmx != NULL) *xmx = mx->xmx; if (mmx != NULL) *mmx = mx->mmx; if (imx != NULL) *imx = mx->imx; if (dmx != NULL) *dmx = mx->dmx; return mx;}/* Function: FreePlan7Matrix() * * Purpose: Free a dynamic programming matrix allocated by AllocPlan7Matrix(). * * Return: (void) */voidFreePlan7Matrix(struct dpmatrix_s *mx){ free (mx->xmx[0]); free (mx->mmx[0]); free (mx->imx[0]); free (mx->dmx[0]); free (mx->xmx); free (mx->mmx); free (mx->imx); free (mx->dmx); free (mx);}/* Function: AllocShadowMatrix() * * Purpose: Allocate a dynamic programming traceback pointer matrix for * a Viterbi algorithm. * * Args: rows - number of rows to allocate; typically L+1 * M - size of model * xtb, mtb, itb, dtb * - RETURN: ptrs to four mx components as a convenience * * Return: mx * mx is allocated here. Caller frees with FreeDPMatrix(mx). */struct dpshadow_s *AllocShadowMatrix(int rows, int M, char ***xtb, char ***mtb, char ***itb, char ***dtb){ struct dpshadow_s *tb; int i; tb = (struct dpshadow_s *) MallocOrDie (sizeof(struct dpshadow_s)); tb->xtb = (char **) MallocOrDie (sizeof(char *) * rows); tb->mtb = (char **) MallocOrDie (sizeof(char *) * rows); tb->itb = (char **) MallocOrDie (sizeof(char *) * rows); tb->dtb = (char **) MallocOrDie (sizeof(char *) * rows); tb->esrc = (int *) MallocOrDie (sizeof(int) * rows); tb->xtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*5)); tb->mtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2))); tb->itb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2))); tb->dtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2))); for (i = 1; i < rows; i++) { tb->xtb[i] = tb->xtb[0] + (i*5); tb->mtb[i] = tb->mtb[0] + (i*(M+2)); tb->itb[i] = tb->itb[0] + (i*(M+2)); tb->dtb[i] = tb->dtb[0] + (i*(M+2)); } if (xtb != NULL) *xtb = tb->xtb; if (mtb != NULL) *mtb = tb->mtb; if (itb != NULL) *itb = tb->itb; if (dtb != NULL) *dtb = tb->dtb; return tb;}/* Function: FreeShadowMatrix() * * Purpose: Free a dynamic programming matrix allocated by AllocShadowMatrix(). * * Return: (void) */voidFreeShadowMatrix(struct dpshadow_s *tb){ free (tb->xtb[0]); free (tb->mtb[0]); free (tb->itb[0]); free (tb->dtb[0]); free (tb->esrc); free (tb->xtb); free (tb->mtb); free (tb->itb); free (tb->dtb); free (tb);}/* Function: P7ViterbiSize() * Date: SRE, Fri Mar 6 15:13:20 1998 [St. Louis] * * Purpose: Returns the ballpark predicted memory requirement for a * P7Viterbi() alignment, in MB. * * Currently L must fit in an int (< 2 GB), but we have * to deal with LM > 2 GB - e.g. watch out for overflow, do * the whole calculation in floating point. Bug here detected * in 2.1.1 by David Harper, Sanger Centre. * * Args: L - length of sequence * M - length of HMM * * Returns: # of MB */intP7ViterbiSize(int L, int M){ float Mbytes; /* We're excessively precise here, but it doesn't cost * us anything to be pedantic. The four terms are: * 1. the matrix structure itself; * 2. the O(NM) main matrix (this dominates!) * 3. ptrs into the rows of the matrix * 4. storage for 5 special states. (xmx) */ Mbytes = (float) sizeof(struct dpmatrix_s); Mbytes += 3. * (float) (L+1) * (float) (M+2) * (float) sizeof(int); Mbytes += 4. * (float) (L+1) * (float) sizeof(int *); Mbytes += 5. * (float) (L+1) * (float) sizeof(int); Mbytes /= 1048576.; return (int) Mbytes;}/* Function: P7SmallViterbiSize() * Date: SRE, Fri Mar 6 15:20:04 1998 [St. Louis] * * Purpose: Returns the ballpark predicted memory requirement for * a P7SmallViterbi() alignment, in MB. * * P7SmallViterbi() is a wrapper, calling both P7ParsingViterbi() * and P7WeeViterbi(). P7ParsingViterbi() typically dominates * the memory requirement, so the value returned * is the P7ParsingViterbi() number. * * We don't (yet) worry about overflow issues like we did with * P7ViterbiSize(). We'll have many other 32-bit int issues in the * code if we overflow here. * * Args: L - length of sequence * M - length of HMM * * Returns: # of MB */intP7SmallViterbiSize(int L, int M){ return ((2 * sizeof(struct dpmatrix_s) + 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */ 16 * sizeof(int *) + /* ptrs into rows of matrix */ 20 * sizeof(int) + /* 5 special states */ 2 * (L+1) * sizeof(int)) /* traceback indices */ / 1000000);}/* Function: P7WeeViterbiSize() * Date: SRE, Fri Mar 6 15:40:42 1998 [St. Louis] * * Purpose: Returns the ballpark predicted memory requirement for * a P7WeeViterbi() alignment, in MB. * * Args: L - length of sequence * M - length of HMM * * Returns: # of MB */intP7WeeViterbiSize(int L, int M){ return ((2 * sizeof(struct dpmatrix_s) + 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */ 16 * sizeof(int *) + /* ptrs into rows of matrix */ 20 * sizeof(int) + /* 5 special states */ 2 * (L+2) * sizeof(int) + /* stacks for starts/ends (overkill) */ (L+2) * sizeof(int) + /* k assignments to seq positions */ (L+2) * sizeof(char)) /* state assignments to seq pos */ / 1000000);}/* Function: P7Forward() * * Purpose: The Forward dynamic programming algorithm. * The scaling issue is dealt with by working in log space * and calling ILogsum(); this is a slow but robust approach. * * Args: dsq - sequence in digitized form * L - length of dsq * hmm - the model * ret_mx - RETURN: dp matrix; pass NULL if it's not wanted * * Return: log P(S|M)/P(S|R), as a bit score. */floatP7Forward(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx){ struct dpmatrix_s *mx; int **xmx; int **mmx; int **imx; int **dmx; int i,k; int sc; /* Allocate a DP matrix with 0..L rows, 0..M-1 columns. */ mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx); /* Initialization of the zero row. * Note that xmx[i][stN] = 0 by definition for all i, * and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need * to be calculated in DP matrices. */ xmx[0][XMN] = 0; /* S->N, p=1 */ xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */ xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */ for (k = 0; k <= hmm->M; k++) mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */ /* Recursion. Done as a pull. * Note some slightly wasteful boundary conditions: * tsc[0] = -INFTY for all eight transitions (no node 0) * D_M and I_M are wastefully calculated (they don't exist) */ for (i = 1; i <= L; i++) { mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY; for (k = 1; k < hmm->M; k++) { mmx[i][k] = ILogsum(ILogsum(mmx[i-1][k-1] + hmm->tsc[k-1][TMM], imx[i-1][k-1] + hmm->tsc[k-1][TIM]), ILogsum(xmx[i-1][XMB] + hmm->bsc[k], dmx[i-1][k-1] + hmm->tsc[k-1][TDM])); mmx[i][k] += hmm->msc[(int) dsq[i]][k]; dmx[i][k] = ILogsum(mmx[i][k-1] + hmm->tsc[k-1][TMD], dmx[i][k-1] + hmm->tsc[k-1][TDD]); imx[i][k] = ILogsum(mmx[i-1][k] + hmm->tsc[k][TMI], imx[i-1][k] + hmm->tsc[k][TII]); imx[i][k] += hmm->isc[(int) dsq[i]][k]; } mmx[i][hmm->M] = ILogsum(ILogsum(mmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TMM], imx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TIM]), ILogsum(xmx[i-1][XMB] + hmm->bsc[hmm->M-1], dmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TDM])); mmx[i][hmm->M] += hmm->msc[(int) dsq[i]][hmm->M]; /* Now the special states. * remember, C and J emissions are zero score by definition */ xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]; xmx[i][XME] = -INFTY; for (k = 1; k <= hmm->M; k++) xmx[i][XME] = ILogsum(xmx[i][XME], mmx[i][k] + hmm->esc[k]); xmx[i][XMJ] = ILogsum(xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP], xmx[i][XME] + hmm->xsc[XTE][LOOP]); xmx[i][XMB] = ILogsum(xmx[i][XMN] + hmm->xsc[XTN][MOVE], xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]); xmx[i][XMC] = ILogsum(xmx[i-1][XMC] + hmm->xsc[XTC][LOOP], xmx[i][XME] + hmm->xsc[XTE][MOVE]); } sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE]; if (ret_mx != NULL) *ret_mx = mx; else FreePlan7Matrix(mx); return Scorify(sc); /* the total Forward score. */} /* Function: P7Viterbi() * * Purpose: The Viterbi dynamic programming algorithm. * Identical to Forward() except that max's * replace sum's. * * Args: dsq - sequence in digitized form * L - length of dsq * hmm - the model * ret_tr - RETURN: traceback; pass NULL if it's not wanted * * Return: log P(S|M)/P(S|R), as a bit score */floatP7Viterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr){ struct dpmatrix_s *mx; struct p7trace_s *tr; int **xmx; int **mmx; int **imx; int **dmx; int i,k; int sc; /* Allocate a DP matrix with 0..L rows, 0..M-1 columns. */ mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx); /* Initialization of the zero row. */ xmx[0][XMN] = 0; /* S->N, p=1 */ xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */ xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */ for (k = 0; k <= hmm->M; k++) mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */ /* Recursion. Done as a pull. * Note some slightly wasteful boundary conditions: * tsc[0] = -INFTY for all eight transitions (no node 0)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -