fast_algorithms.c

来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,087 行 · 第 1/3 页

C
1,087
字号
 *  *           See additional comments in  *           core_algorithms.c:ResizePlan7Matrix(), the normal version *           of this function. This version is only used in the *           Altivec (--enable-altivec) port. *            *           Returns individual ptrs to the four matrix components *           as a convenience. *            * Args:     mx    - an already allocated model to grow. *           N     - seq length to allocate for; N+1 rows *           M     - size of model *           xmx, mmx, imx, dmx  *                 - RETURN: ptrs to four mx components as a convenience *                    * Return:   (void) *           mx is (re)allocated here. */voidResizePlan7Matrix(struct dpmatrix_s *mx, int N, int M, 		  int ***xmx, int ***mmx, int ***imx, int ***dmx){  int i,n;  if (N <= mx->maxN && M <= mx->maxM)     {      if (xmx != NULL) *xmx = mx->xmx;      if (mmx != NULL) *mmx = mx->mmx;      if (imx != NULL) *imx = mx->imx;      if (dmx != NULL) *dmx = mx->dmx;      return;     }  if (N > mx->maxN) {    N          += mx->padN;     mx->maxN    = N;     mx->xmx     = (int **) ReallocOrDie (mx->xmx, sizeof(int *) * (mx->maxN+1));    mx->mmx     = (int **) ReallocOrDie (mx->mmx, sizeof(int *) * (mx->maxN+1));    mx->imx     = (int **) ReallocOrDie (mx->imx, sizeof(int *) * (mx->maxN+1));    mx->dmx     = (int **) ReallocOrDie (mx->dmx, sizeof(int *) * (mx->maxN+1));  }  if (M > mx->maxM) {    M += mx->padM;     mx->maxM = M;   }  mx->xmx_mem = ReallocOrDie (mx->xmx_mem, sizeof(int) * (mx->maxN+1)*(5 + 16));  mx->mmx_mem = ReallocOrDie (mx->mmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));  mx->imx_mem = ReallocOrDie (mx->imx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));  mx->dmx_mem = ReallocOrDie (mx->dmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));    mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);    /* And make sure the beginning of each row is aligned the same way */  for (i = 1; i <= mx->maxN; i++)    {      mx->xmx[i] = mx->xmx[0] + i*(5+11) ; /* add 11 bytes per row, making it divisible by 4 */      n = 12 - (mx->maxM+2)%4;      mx->mmx[i] = mx->mmx[0] + i*(mx->maxM+2+n);      mx->imx[i] = mx->imx[0] + i*(mx->maxM+2+n);      mx->dmx[i] = mx->dmx[0] + i*(mx->maxM+2+n);    }   if (xmx != NULL) *xmx = mx->xmx;  if (mmx != NULL) *mmx = mx->mmx;  if (imx != NULL) *imx = mx->imx;  if (dmx != NULL) *dmx = mx->dmx;}/* Function: P7Viterbi() *  * Purpose:  The Viterbi dynamic programming algorithm; Altivec implementation *           by Erik Lindahl, Stanford University, 2002. *            * Args:     dsq    - sequence in digitized form *           L      - length of dsq *           hmm    - the model *           mx     - DP matrix (may get grown here) *           ret_tr - RETURN: traceback; pass NULL if it's not wanted *            * Return:   log P(S|M)/P(S|R), as a bit score *//* This first version of P7Viterbi has been accelerated with Altivec vectorization. * On Apple hardware, it is up to a factor 10 faster than the old non-altivec version. */typedef union {vector signed int v;int i[4];} ivector;void printivec(vector signed int z){    ivector q;    q.v=z;    printf("%d  %d  %d  %d\n",q.i[0],q.i[1],q.i[2],q.i[3]);}floatP7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr) {  struct p7trace_s  *tr;  int **xmx;  int **mmx;  int **imx;  int **dmx;  int  *mmxi,*xmxi,*imxi,*dmxi;  int  *lmmxi,*lxmxi,*limxi,*ldmxi;  int  *p_tmm,*p_tim,*p_tdm,*p_bsc,*p_msc,*p_tmd,*p_tdd,*p_tmi,*p_tii,*p_isc,*p_esc;  int   i,n,k;  int   sc;  /* gcc and motorola use different syntax for initializing vectors,   * so we use a dummy variable instead to get an adress to load from...   */     int t_lowscore = -INFTY;    /* vector variables. We avoid the stupid gcc register spill/fill code by   * limiting ourselves to 32 generic variables and making the all registers.   * (This reuse is the reason for the generic variable names).   */  register vector signed int v_lowscore;  register vector signed int max_mmxesc;  register vector signed int v_xmb;  register vector unsigned int mask1;  register vector unsigned int mask2;  register vector unsigned int mask3;  register vector unsigned int mask4;  register vector signed int v_lmmx1;  register vector signed int v_lmmx2;  register vector signed int v_limx1;  register vector signed int v_limx2;  register vector signed int v_save_lmmx;  register vector signed int v_save_ldmx;  register vector signed int v_save_limx;  register vector signed int v_save_mmx;  register vector signed int v_save_dmx;  register vector signed int v1;  register vector signed int v2;  register vector signed int v3;  register vector signed int v4;  register vector signed int v5;  register vector signed int v6;  register vector signed int v7;  register vector signed int v8;  register vector signed int v9;  register vector signed int v10;  register vector signed int v11;  register vector signed int v12;  register vector signed int v13;  register vector signed int v14;  register vector signed int v15;  /* load (-infinity) to all four elements in v_lowscore */  v_lowscore      = vec_lde(0, &t_lowscore );  mask1           = (vector unsigned int)vec_lvsl(0,&t_lowscore);  v_lowscore      = vec_perm(v_lowscore,v_lowscore,(vector unsigned char)mask1);  v_lowscore      = vec_splat(v_lowscore,0);    v1 = vec_splat_s32(-1);  v2 = vec_splat_s32(0);  mask1 = (vector unsigned int)vec_sld(v1,v2,12); /* FF in first pos, rest. are 00 */  mask2 = vec_sld(mask1,mask1,12);  mask3 = vec_sld(mask1,mask1,8);  mask4 = vec_sld(mask1,mask1,4);  /* Make sure our DP matrix has 0..L rows, 0..M columns; grow it if needed.    */  ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);  /* Initialization of the zero row.   */  xmx[0][XMN] = 0;		                     /* S->N, p=1            */  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */    mmxi=mmx[0];  imxi=imx[0];  dmxi=dmx[0];  xmxi=xmx[0];    for (n = 0; n  < 5+hmm->M; n+=4) {    vec_st(v_lowscore, n*4, mmxi);    vec_st(v_lowscore, n*4, imxi);    vec_st(v_lowscore, n*4, dmxi);  }  /* Fill data beyound M with -INFTY, so we can take the maximum including   * elements with k>M.   */  for(k=1+hmm->M;k<=(3+hmm->M);k++) {      hmm->esc[k]=-INFTY;      hmm->bsc[k]=-INFTY;      for(i=0;i<7;i++)	hmm->tsc[i][k]=-INFTY;      for(i=0;i<MAXCODE;i++) {	hmm->msc[i][k]=-INFTY;	hmm->isc[i][k]=-INFTY;      }  }    /* Recursion. Done as a pull   * Note some slightly wasteful boundary conditions:     *    tsc[0] = -INFTY for all eight transitions (no node 0)   *    D_M and I_M are wastefully calculated (they don't exist)   */  for (i = 1; i <= L; i++) {    /* pointers to last (i-1) row */    lmmxi=mmxi;    limxi=imxi;    ldmxi=dmxi;    lxmxi=xmxi;    /* get pointers to this row */    mmxi=mmx[i];    imxi=imx[i];    dmxi=dmx[i];    xmxi=xmx[i];        /* Set everything that doesnt depend on k here */            /* load and splat (spread to all elements) XMX[i-1][XMB] */    v13   = vec_lde(0,&(xmx[i-1][XMB]));    v14   = (vector signed int)vec_lvsl(0,&(xmx[i-1][XMB]));    v13   = vec_perm(v13,v13,(vector unsigned char)v14);    v_xmb = vec_splat(v13,0);    p_tmm = hmm->tsc[TMM];    p_tim = hmm->tsc[TIM];    p_tdm = hmm->tsc[TDM];    p_bsc = hmm->bsc;    k = dsq[i];    p_msc = hmm->msc[k];    p_isc = hmm->isc[k];    p_tmd = hmm->tsc[TMD];    p_tdd = hmm->tsc[TDD];    p_tmi = hmm->tsc[TMI];    p_tii = hmm->tsc[TII];    p_esc = hmm->esc;    max_mmxesc = v_lowscore;        /* the 0 element of vectors are aligned 12 bytes up from the 16-byte boundary,     * so we simply write the entire 16 bytes before the 16 byte boundary.     */    vec_st(v_lowscore,0,mmxi);    vec_st(v_lowscore,0,imxi);    vec_st(v_lowscore,0,dmxi);        /* Load the first (i.e. 'previous') vector on last row for mmx,imx,dmx,     * and load the first vector of dmx and mmx on this row.     */    v_save_lmmx = vec_ld(-12, lmmxi);    v_save_limx = vec_ld(-12, limxi);    v_save_ldmx = vec_ld(-12, ldmxi);    v_save_mmx = vec_ld(-12, mmxi);    v_save_dmx = vec_ld(-12, dmxi);    /* we have allocated extra memory, so it is perfectly OK     * to do the calculations for a couple of extra cells where     * k>hmm->M. These cells just wont be used.     */    for (n = 4, k=1; k < (hmm->M-3) ; n+=32, k+=8) {	/* match state */	/* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better,	* but do it for 8 elements in parallel.	* Since we are comparing with data on the previous row but one position	* earlier, we have to shift stuff. Load two new vectors each round,	* and use the saved register from last round. 	*/	/* load mmx data */	v_lmmx1 = vec_ld(n,    lmmxi);	v_lmmx2 = vec_ld(n+16, lmmxi);	/* load imx data */	v_limx1 = vec_ld(n,    limxi);	v_limx2 = vec_ld(n+16, limxi);	v5    = vec_ld(n,    ldmxi);	/* Load dmx data */	v10   = vec_ld(n+16, ldmxi);		/* shift mmx, imx & dmx data */	v1    = vec_sld(v_save_lmmx,v_lmmx1,12);	v3    = vec_sld(v_save_limx,v_limx1,12);	v9    = vec_sld(v_save_ldmx,v5,12);	/* shift mmx, imx & dmx data */	v2    = vec_sld(v_lmmx1,v_lmmx2,12);	v4    = vec_sld(v_limx1,v_limx2,12);	v_save_ldmx = v10;	v10   = vec_sld(v5,v10,12);		v_save_lmmx = v_lmmx2;	v_save_limx = v_limx2;		/* v1,v2 now contains 8 element with mmx[i-1][k-1], 	 * v3,v4 contain 8 elements with imx[i-1][k-1],         * and v9,v10 contain 8 elements with dmx[i-1][k-1].	 */	/* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */        v5    = vec_ld(n-4, p_tmm);        v6    = vec_ld(n+12, p_tmm);        v7    = vec_ld(n-4, p_tim);        v8    = vec_ld(n+12, p_tim);        v11   = vec_ld(n-4, p_tdm);        v12   = vec_ld(n+12, p_tdm);	/* load bsc[k] */	v14   = vec_ld(n, p_bsc);	v15   = vec_ld(n+16, p_bsc);		/* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so          * we don't loop if we add the large negative numbers used for -infinity.	 */	v1    = vec_adds(v1,v5);	v2    = vec_adds(v2,v6);	v3    = vec_adds(v3,v7);	v4    = vec_adds(v4,v8);	v9    = vec_adds(v9,v11);	v10   = vec_adds(v10,v12);	v14   = vec_adds(v14,v_xmb);	v15   = vec_adds(v15,v_xmb);	/* Select max of mmx+TMM and imx+TIM in each element */	v1    = vec_max(v1,v3);	v2    = vec_max(v2,v4);	/* select max of dmx+TDM and XMX+bsc */	v9    = vec_max(v9,v14);	v10   = vec_max(v10,v15);	        /* select max of the four alternatives */	v1    = vec_max(v1,v9);	v2    = vec_max(v2,v10);	/* v1,v2 now contain the max values for the new mmx;	 * check if we should add msc.         */        v3    = vec_ld(n,    p_msc);        v4    = vec_ld(n+16, p_msc);        v5    = (vector signed int)vec_cmpgt(v3,v_lowscore);        v6    = (vector signed int)vec_cmpgt(v4,v_lowscore);	/* load esc[k] */	v9    = vec_ld(n, p_esc);	v10   = vec_ld(n+16, p_esc);		v1    = vec_adds(v1,v3);	v2    = vec_adds(v2,v4);        v1    = vec_sel(v3,v1,(vector unsigned int)v5);        v2    = vec_sel(v4,v2,(vector unsigned int)v6);      	/* have final values for mmx on this row in v1,v2 - store it */	vec_st(v1, n,    mmxi);	vec_st(v2, n+16, mmxi);	v9    = vec_adds(v9,v1);	v10   = vec_adds(v10,v2);	v9    = vec_max(v9,v10);	max_mmxesc = vec_max(v9,max_mmxesc);		/* OK. We finished the match state. Normally we could now first	 * do the delete and then the insertion. The problem is that the	 * delete is a pain in the ass to vectorize, since each element	 * depends on the previous one in the vector. This means I have

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?