core_algorithms.c

来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,965 行 · 第 1/5 页

C
1,965
字号
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-2003 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. ************************************************************//* core_algorithms.c * SRE, Mon Nov 11 15:58:52 1996 * CVS $Id: core_algorithms.c,v 1.35 2003/10/03 18:26:37 eddy Exp $ *  * Simple and robust "research" implementations of Forward, Backward, * and Viterbi for Plan7. For optimized replacements for some of these functions, * see fast_algorithms.c. */#include "config.h"#include "squidconf.h"#include "structs.h"#include "funcs.h"#include "squid.h"#include <string.h>#include <assert.h>static float get_wee_midpt(struct plan7_s *hmm, unsigned char *dsq, int L, 			   int k1, char t1, int s1,			   int k3, char t3, int s3,			   int *ret_k2, char *ret_t2, int *ret_s2);#ifndef ALTIVEC/* Function: CreatePlan7Matrix() *  * Purpose:  Create a dynamic programming matrix for standard Forward, *           Backward, or Viterbi, with scores kept as scaled log-odds *           integers. Keeps 2D arrays compact in RAM in an attempt  *           to maximize cache hits.  *            *           The mx structure can be dynamically grown, if a new *           HMM or seq exceeds the currently allocated size. Dynamic *           growing is more efficient than an alloc/free of a whole *           matrix for every new target. The ResizePlan7Matrix() *           call does this reallocation, if needed. Here, in the *           creation step, we set up some pads - to inform the resizing *           call how much to overallocate when it realloc's.  *            * Args:     N     - N+1 rows are allocated, for sequence.   *           M     - size of model in nodes *           padN  - over-realloc in seq/row dimension, or 0 *           padM  - over-realloc in HMM/column dimension, or 0 *                  * Return:   mx *           mx is allocated here. Caller frees with FreePlan7Matrix(mx). */struct dpmatrix_s *CreatePlan7Matrix(int N, int M, int padN, int padM){  struct dpmatrix_s *mx;  int i;  mx          = (struct dpmatrix_s *) MallocOrDie (sizeof(struct dpmatrix_s));  mx->xmx     = (int **) MallocOrDie (sizeof(int *) * (N+1));  mx->mmx     = (int **) MallocOrDie (sizeof(int *) * (N+1));  mx->imx     = (int **) MallocOrDie (sizeof(int *) * (N+1));  mx->dmx     = (int **) MallocOrDie (sizeof(int *) * (N+1));  mx->xmx_mem = (void *) MallocOrDie (sizeof(int) * ((N+1)*5));  mx->mmx_mem = (void *) MallocOrDie (sizeof(int) * ((N+1)*(M+2)));  mx->imx_mem = (void *) MallocOrDie (sizeof(int) * ((N+1)*(M+2)));  mx->dmx_mem = (void *) MallocOrDie (sizeof(int) * ((N+1)*(M+2)));  /* The indirect assignment below looks wasteful; it's actually   * used for aligning data on 16-byte boundaries as a cache    * optimization in the fast altivec implementation   */  mx->xmx[0] = (int *) mx->xmx_mem;  mx->mmx[0] = (int *) mx->mmx_mem;  mx->imx[0] = (int *) mx->imx_mem;  mx->dmx[0] = (int *) mx->dmx_mem;  for (i = 1; i <= N; i++)    {      mx->xmx[i] = mx->xmx[0] + (i*5);       mx->mmx[i] = mx->mmx[0] + (i*(M+2));      mx->imx[i] = mx->imx[0] + (i*(M+2));      mx->dmx[i] = mx->dmx[0] + (i*(M+2));    }  mx->maxN = N;  mx->maxM = M;  mx->padN = padN;  mx->padM = padM;    return mx;}#endif /*ALTIVEC*/#ifndef ALTIVEC/* Function: ResizePlan7Matrix() *  * Purpose:  Reallocate a dynamic programming matrix, if necessary, *           for a problem of NxM: sequence length N, model size M. *           (N=1 for small memory score-only variants; we allocate *           N+1 rows in the DP matrix.)  *            *           We know (because of the way hmmsearch and hmmpfam are coded) *           that only one of the two dimensions is going to change *           in size after the first call to ResizePlan7Matrix(); *           that is, for hmmsearch, we have one HMM of fixed size M *           and our target sequences may grow in N; for hmmpfam, *           we have one sequence of fixed size N and our target models *           may grow in M. What we have to watch out for is P7SmallViterbi() *           working on a divide and conquer problem and passing us N < maxN, *           M > maxM; we should definitely *not* reallocate a smaller N. *           Since we know that only one dimension is going to grow, *           we aren't scared of reallocating to maxN,maxM. (If both *           M and N could grow, we would be more worried.) * *           Returns individual ptrs to the four matrix components *           as a convenience. *            * Args:     mx    - an already allocated model to grow. *           N     - seq length to allocate for; N+1 rows *           M     - size of model *           xmx, mmx, imx, dmx  *                 - RETURN: ptrs to four mx components as a convenience *                    * Return:   (void) *           mx is (re)allocated here. */voidResizePlan7Matrix(struct dpmatrix_s *mx, int N, int M, 		  int ***xmx, int ***mmx, int ***imx, int ***dmx){  int i;  if (N <= mx->maxN && M <= mx->maxM) goto DONE;    if (N > mx->maxN) {    N          += mx->padN;     mx->maxN    = N;     mx->xmx     = (int **) ReallocOrDie (mx->xmx, sizeof(int *) * (mx->maxN+1));    mx->mmx     = (int **) ReallocOrDie (mx->mmx, sizeof(int *) * (mx->maxN+1));    mx->imx     = (int **) ReallocOrDie (mx->imx, sizeof(int *) * (mx->maxN+1));    mx->dmx     = (int **) ReallocOrDie (mx->dmx, sizeof(int *) * (mx->maxN+1));  }  if (M > mx->maxM) {    M += mx->padM;     mx->maxM = M;   }  mx->xmx_mem = (void *) ReallocOrDie (mx->xmx_mem, sizeof(int) * ((mx->maxN+1)*5));  mx->mmx_mem = (void *) ReallocOrDie (mx->mmx_mem, sizeof(int) * ((mx->maxN+1)*(mx->maxM+2)));  mx->imx_mem = (void *) ReallocOrDie (mx->imx_mem, sizeof(int) * ((mx->maxN+1)*(mx->maxM+2)));  mx->dmx_mem = (void *) ReallocOrDie (mx->dmx_mem, sizeof(int) * ((mx->maxN+1)*(mx->maxM+2)));  mx->xmx[0] = (int *) mx->xmx_mem;  mx->mmx[0] = (int *) mx->mmx_mem;  mx->imx[0] = (int *) mx->imx_mem;  mx->dmx[0] = (int *) mx->dmx_mem;  for (i = 1; i <= mx->maxN; i++)    {      mx->xmx[i] = mx->xmx[0] + (i*5);       mx->mmx[i] = mx->mmx[0] + (i*(mx->maxM+2));      mx->imx[i] = mx->imx[0] + (i*(mx->maxM+2));      mx->dmx[i] = mx->dmx[0] + (i*(mx->maxM+2));    } DONE:  if (xmx != NULL) *xmx = mx->xmx;  if (mmx != NULL) *mmx = mx->mmx;  if (imx != NULL) *imx = mx->imx;  if (dmx != NULL) *dmx = mx->dmx;}#endif /*ALTIVEC*//* Function: AllocPlan7Matrix() * Date:     SRE, Tue Nov 19 07:14:47 2002 [St. Louis] * * Purpose:  Used to be the main allocator for dp matrices; we used to *           allocate, calculate, free. But this spent a lot of time *           in malloc(). Replaced with Create..() and Resize..() to *           allow matrix reuse in P7Viterbi(), the main alignment  *           engine. But matrices are alloc'ed by other alignment engines *           too, ones that are less frequently called and less  *           important to optimization of cpu performance. Instead of *           tracking changes through them, for now, provide *           an Alloc...() call with the same API that's just a wrapper. * * Args:     rows  - generally L+1, or 2; # of DP rows in seq dimension to alloc *           M     - size of model, in nodes *           xmx, mmx, imx, dmx  *                 - RETURN: ptrs to four mx components as a convenience * * Returns:  mx *           Caller free's w/ FreePlan7Matrix() */struct dpmatrix_s *AllocPlan7Matrix(int rows, int M, int ***xmx, int ***mmx, int ***imx, int ***dmx){  struct dpmatrix_s *mx;  mx = CreatePlan7Matrix(rows-1, M, 0, 0);  if (xmx != NULL) *xmx = mx->xmx;  if (mmx != NULL) *mmx = mx->mmx;  if (imx != NULL) *imx = mx->imx;  if (dmx != NULL) *dmx = mx->dmx;  return mx;}/* Function: FreePlan7Matrix() *  * Purpose:  Free a dynamic programming matrix allocated by CreatePlan7Matrix(). *  * Return:   (void) */voidFreePlan7Matrix(struct dpmatrix_s *mx){  free (mx->xmx_mem);  free (mx->mmx_mem);  free (mx->imx_mem);  free (mx->dmx_mem);  free (mx->xmx);  free (mx->mmx);  free (mx->imx);  free (mx->dmx);  free (mx);}/* Function: AllocShadowMatrix() *  * Purpose:  Allocate a dynamic programming traceback pointer matrix for  *           a Viterbi algorithm.  *            * Args:     rows  - number of rows to allocate; typically L+1 *           M     - size of model *           xtb, mtb, itb, dtb  *                 - RETURN: ptrs to four mx components as a convenience *                  * Return:   mx *           mx is allocated here. Caller frees with FreeDPMatrix(mx). */struct dpshadow_s *AllocShadowMatrix(int rows, int M, char ***xtb, char ***mtb, char ***itb, char ***dtb){  struct dpshadow_s *tb;  int i;  tb         = (struct dpshadow_s *) MallocOrDie (sizeof(struct dpshadow_s));  tb->xtb    = (char **) MallocOrDie (sizeof(char *) * rows);  tb->mtb    = (char **) MallocOrDie (sizeof(char *) * rows);  tb->itb    = (char **) MallocOrDie (sizeof(char *) * rows);  tb->dtb    = (char **) MallocOrDie (sizeof(char *) * rows);  tb->esrc   = (int *)   MallocOrDie (sizeof(int)  * rows);  tb->xtb[0] = (char *)  MallocOrDie (sizeof(char) * (rows*5));  tb->mtb[0] = (char *)  MallocOrDie (sizeof(char) * (rows*(M+2)));  tb->itb[0] = (char *)  MallocOrDie (sizeof(char) * (rows*(M+2)));  tb->dtb[0] = (char *)  MallocOrDie (sizeof(char) * (rows*(M+2)));  for (i = 1; i < rows; i++)    {      tb->xtb[i] = tb->xtb[0] + (i*5);       tb->mtb[i] = tb->mtb[0] + (i*(M+2));      tb->itb[i] = tb->itb[0] + (i*(M+2));      tb->dtb[i] = tb->dtb[0] + (i*(M+2));    }  if (xtb != NULL) *xtb = tb->xtb;  if (mtb != NULL) *mtb = tb->mtb;  if (itb != NULL) *itb = tb->itb;  if (dtb != NULL) *dtb = tb->dtb;  return tb;}/* Function: FreeShadowMatrix() *  * Purpose:  Free a dynamic programming matrix allocated by AllocShadowMatrix(). *  * Return:   (void) */voidFreeShadowMatrix(struct dpshadow_s *tb){  free (tb->xtb[0]);  free (tb->mtb[0]);  free (tb->itb[0]);  free (tb->dtb[0]);  free (tb->esrc);  free (tb->xtb);  free (tb->mtb);  free (tb->itb);  free (tb->dtb);  free (tb);}/* Function:  P7ViterbiSpaceOK() * Incept:    SRE, Wed Oct  1 12:53:13 2003 [St. Louis] * * Purpose:   Returns TRUE if the existing matrix allocation *            is already sufficient to hold the requested MxN, or if *            the matrix can be expanded in M and/or N without *            exceeding RAMLIMIT megabytes.  *             *            This gets called anytime we're about to do P7Viterbi(). *            If it returns FALSE, we switch into the appropriate *            small-memory alternative: P7SmallViterbi() or *            P7WeeViterbi(). *             *            Checking the DP problem size against P7ViterbiSize() *            is not enough, because the DP matrix may be already *            allocated in MxN. For example, if we're already *            allocated to maxM,maxN of 1447x979, and we try to *            do a problem of MxN=12x125000, P7ViterbiSize() may *            think that's fine - but when we resize, we can only *            grow, so we'll expand to 1447x125000, which is  *            likely over the RAMLIMIT. [bug#h26; xref SLT7 p.122] * * Args:       * * Returns:   TRUE if we can run P7Viterbi(); FALSE if we need *            to use a small memory variant. * * Xref:      STL7 p.122. */intP7ViterbiSpaceOK(int L, int M, struct dpmatrix_s *mx){  int newM;  int newN;  if (M > mx->maxM) newM = M + mx->padM; else newM = mx->maxM;  if (L > mx->maxN) newN = L + mx->padN; else newN = mx->maxN;  if (P7ViterbiSize(newN, newM) <= RAMLIMIT)    return TRUE;  else    return FALSE;}/* Function: P7ViterbiSize() * Date:     SRE, Fri Mar  6 15:13:20 1998 [St. Louis] * * Purpose:  Returns the ballpark predicted memory requirement for a  *           P7Viterbi() alignment, in MB.  *            *           Currently L must fit in an int (< 2 GB), but we have *           to deal with LM > 2 GB - e.g. watch out for overflow, do *           the whole calculation in floating point. Bug here detected *           in 2.1.1 by David Harper, Sanger Centre. *                                     * Args:     L  - length of sequence *           M  - length of HMM        * * Returns:  # of MB */intP7ViterbiSize(int L, int M){  float Mbytes;  /* We're excessively precise here, but it doesn't cost   * us anything to be pedantic. The four terms are:   *   1. the matrix structure itself;   *   2. the O(NM) main matrix (this dominates!)   *   3. ptrs into the rows of the matrix   *   4. storage for 5 special states. (xmx)   */  Mbytes =  (float) sizeof(struct dpmatrix_s);   Mbytes += 3. * (float) (L+1) * (float) (M+2) * (float) sizeof(int);   Mbytes += 4. * (float) (L+1) * (float) sizeof(int *);   Mbytes += 5. * (float) (L+1) * (float) sizeof(int);  Mbytes /= 1048576.;  return (int) Mbytes;}/* Function: P7SmallViterbiSize() * Date:     SRE, Fri Mar  6 15:20:04 1998 [St. Louis] * * Purpose:  Returns the ballpark predicted memory requirement for *           a P7SmallViterbi() alignment, in MB.  *            *           P7SmallViterbi() is a wrapper, calling both P7ParsingViterbi() *           and P7WeeViterbi(). P7ParsingViterbi() typically dominates *           the memory requirement, so the value returned *           is the P7ParsingViterbi() number. *     

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?