fast_algorithms.c
来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,087 行 · 第 1/3 页
C
1,087 行
* * See additional comments in * core_algorithms.c:ResizePlan7Matrix(), the normal version * of this function. This version is only used in the * Altivec (--enable-altivec) port. * * Returns individual ptrs to the four matrix components * as a convenience. * * Args: mx - an already allocated model to grow. * N - seq length to allocate for; N+1 rows * M - size of model * xmx, mmx, imx, dmx * - RETURN: ptrs to four mx components as a convenience * * Return: (void) * mx is (re)allocated here. */voidResizePlan7Matrix(struct dpmatrix_s *mx, int N, int M, int ***xmx, int ***mmx, int ***imx, int ***dmx){ int i,n; if (N <= mx->maxN && M <= mx->maxM) { if (xmx != NULL) *xmx = mx->xmx; if (mmx != NULL) *mmx = mx->mmx; if (imx != NULL) *imx = mx->imx; if (dmx != NULL) *dmx = mx->dmx; return; } if (N > mx->maxN) { N += mx->padN; mx->maxN = N; mx->xmx = (int **) ReallocOrDie (mx->xmx, sizeof(int *) * (mx->maxN+1)); mx->mmx = (int **) ReallocOrDie (mx->mmx, sizeof(int *) * (mx->maxN+1)); mx->imx = (int **) ReallocOrDie (mx->imx, sizeof(int *) * (mx->maxN+1)); mx->dmx = (int **) ReallocOrDie (mx->dmx, sizeof(int *) * (mx->maxN+1)); } if (M > mx->maxM) { M += mx->padM; mx->maxM = M; } mx->xmx_mem = ReallocOrDie (mx->xmx_mem, sizeof(int) * (mx->maxN+1)*(5 + 16)); mx->mmx_mem = ReallocOrDie (mx->mmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16)); mx->imx_mem = ReallocOrDie (mx->imx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16)); mx->dmx_mem = ReallocOrDie (mx->dmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16)); mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12); mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12); mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12); mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12); /* And make sure the beginning of each row is aligned the same way */ for (i = 1; i <= mx->maxN; i++) { mx->xmx[i] = mx->xmx[0] + i*(5+11) ; /* add 11 bytes per row, making it divisible by 4 */ n = 12 - (mx->maxM+2)%4; mx->mmx[i] = mx->mmx[0] + i*(mx->maxM+2+n); mx->imx[i] = mx->imx[0] + i*(mx->maxM+2+n); mx->dmx[i] = mx->dmx[0] + i*(mx->maxM+2+n); } if (xmx != NULL) *xmx = mx->xmx; if (mmx != NULL) *mmx = mx->mmx; if (imx != NULL) *imx = mx->imx; if (dmx != NULL) *dmx = mx->dmx;}/* Function: P7Viterbi() * * Purpose: The Viterbi dynamic programming algorithm; Altivec implementation * by Erik Lindahl, Stanford University, 2002. * * Args: dsq - sequence in digitized form * L - length of dsq * hmm - the model * mx - DP matrix (may get grown here) * ret_tr - RETURN: traceback; pass NULL if it's not wanted * * Return: log P(S|M)/P(S|R), as a bit score *//* This first version of P7Viterbi has been accelerated with Altivec vectorization. * On Apple hardware, it is up to a factor 10 faster than the old non-altivec version. */typedef union {vector signed int v;int i[4];} ivector;void printivec(vector signed int z){ ivector q; q.v=z; printf("%d %d %d %d\n",q.i[0],q.i[1],q.i[2],q.i[3]);}floatP7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr) { struct p7trace_s *tr; int **xmx; int **mmx; int **imx; int **dmx; int *mmxi,*xmxi,*imxi,*dmxi; int *lmmxi,*lxmxi,*limxi,*ldmxi; int *p_tmm,*p_tim,*p_tdm,*p_bsc,*p_msc,*p_tmd,*p_tdd,*p_tmi,*p_tii,*p_isc,*p_esc; int i,n,k; int sc; /* gcc and motorola use different syntax for initializing vectors, * so we use a dummy variable instead to get an adress to load from... */ int t_lowscore = -INFTY; /* vector variables. We avoid the stupid gcc register spill/fill code by * limiting ourselves to 32 generic variables and making the all registers. * (This reuse is the reason for the generic variable names). */ register vector signed int v_lowscore; register vector signed int max_mmxesc; register vector signed int v_xmb; register vector unsigned int mask1; register vector unsigned int mask2; register vector unsigned int mask3; register vector unsigned int mask4; register vector signed int v_lmmx1; register vector signed int v_lmmx2; register vector signed int v_limx1; register vector signed int v_limx2; register vector signed int v_save_lmmx; register vector signed int v_save_ldmx; register vector signed int v_save_limx; register vector signed int v_save_mmx; register vector signed int v_save_dmx; register vector signed int v1; register vector signed int v2; register vector signed int v3; register vector signed int v4; register vector signed int v5; register vector signed int v6; register vector signed int v7; register vector signed int v8; register vector signed int v9; register vector signed int v10; register vector signed int v11; register vector signed int v12; register vector signed int v13; register vector signed int v14; register vector signed int v15; /* load (-infinity) to all four elements in v_lowscore */ v_lowscore = vec_lde(0, &t_lowscore ); mask1 = (vector unsigned int)vec_lvsl(0,&t_lowscore); v_lowscore = vec_perm(v_lowscore,v_lowscore,(vector unsigned char)mask1); v_lowscore = vec_splat(v_lowscore,0); v1 = vec_splat_s32(-1); v2 = vec_splat_s32(0); mask1 = (vector unsigned int)vec_sld(v1,v2,12); /* FF in first pos, rest. are 00 */ mask2 = vec_sld(mask1,mask1,12); mask3 = vec_sld(mask1,mask1,8); mask4 = vec_sld(mask1,mask1,4); /* Make sure our DP matrix has 0..L rows, 0..M columns; grow it if needed. */ ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx); /* Initialization of the zero row. */ xmx[0][XMN] = 0; /* S->N, p=1 */ xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */ xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */ mmxi=mmx[0]; imxi=imx[0]; dmxi=dmx[0]; xmxi=xmx[0]; for (n = 0; n < 5+hmm->M; n+=4) { vec_st(v_lowscore, n*4, mmxi); vec_st(v_lowscore, n*4, imxi); vec_st(v_lowscore, n*4, dmxi); } /* Fill data beyound M with -INFTY, so we can take the maximum including * elements with k>M. */ for(k=1+hmm->M;k<=(3+hmm->M);k++) { hmm->esc[k]=-INFTY; hmm->bsc[k]=-INFTY; for(i=0;i<7;i++) hmm->tsc[i][k]=-INFTY; for(i=0;i<MAXCODE;i++) { hmm->msc[i][k]=-INFTY; hmm->isc[i][k]=-INFTY; } } /* Recursion. Done as a pull * Note some slightly wasteful boundary conditions: * tsc[0] = -INFTY for all eight transitions (no node 0) * D_M and I_M are wastefully calculated (they don't exist) */ for (i = 1; i <= L; i++) { /* pointers to last (i-1) row */ lmmxi=mmxi; limxi=imxi; ldmxi=dmxi; lxmxi=xmxi; /* get pointers to this row */ mmxi=mmx[i]; imxi=imx[i]; dmxi=dmx[i]; xmxi=xmx[i]; /* Set everything that doesnt depend on k here */ /* load and splat (spread to all elements) XMX[i-1][XMB] */ v13 = vec_lde(0,&(xmx[i-1][XMB])); v14 = (vector signed int)vec_lvsl(0,&(xmx[i-1][XMB])); v13 = vec_perm(v13,v13,(vector unsigned char)v14); v_xmb = vec_splat(v13,0); p_tmm = hmm->tsc[TMM]; p_tim = hmm->tsc[TIM]; p_tdm = hmm->tsc[TDM]; p_bsc = hmm->bsc; k = dsq[i]; p_msc = hmm->msc[k]; p_isc = hmm->isc[k]; p_tmd = hmm->tsc[TMD]; p_tdd = hmm->tsc[TDD]; p_tmi = hmm->tsc[TMI]; p_tii = hmm->tsc[TII]; p_esc = hmm->esc; max_mmxesc = v_lowscore; /* the 0 element of vectors are aligned 12 bytes up from the 16-byte boundary, * so we simply write the entire 16 bytes before the 16 byte boundary. */ vec_st(v_lowscore,0,mmxi); vec_st(v_lowscore,0,imxi); vec_st(v_lowscore,0,dmxi); /* Load the first (i.e. 'previous') vector on last row for mmx,imx,dmx, * and load the first vector of dmx and mmx on this row. */ v_save_lmmx = vec_ld(-12, lmmxi); v_save_limx = vec_ld(-12, limxi); v_save_ldmx = vec_ld(-12, ldmxi); v_save_mmx = vec_ld(-12, mmxi); v_save_dmx = vec_ld(-12, dmxi); /* we have allocated extra memory, so it is perfectly OK * to do the calculations for a couple of extra cells where * k>hmm->M. These cells just wont be used. */ for (n = 4, k=1; k < (hmm->M-3) ; n+=32, k+=8) { /* match state */ /* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better, * but do it for 8 elements in parallel. * Since we are comparing with data on the previous row but one position * earlier, we have to shift stuff. Load two new vectors each round, * and use the saved register from last round. */ /* load mmx data */ v_lmmx1 = vec_ld(n, lmmxi); v_lmmx2 = vec_ld(n+16, lmmxi); /* load imx data */ v_limx1 = vec_ld(n, limxi); v_limx2 = vec_ld(n+16, limxi); v5 = vec_ld(n, ldmxi); /* Load dmx data */ v10 = vec_ld(n+16, ldmxi); /* shift mmx, imx & dmx data */ v1 = vec_sld(v_save_lmmx,v_lmmx1,12); v3 = vec_sld(v_save_limx,v_limx1,12); v9 = vec_sld(v_save_ldmx,v5,12); /* shift mmx, imx & dmx data */ v2 = vec_sld(v_lmmx1,v_lmmx2,12); v4 = vec_sld(v_limx1,v_limx2,12); v_save_ldmx = v10; v10 = vec_sld(v5,v10,12); v_save_lmmx = v_lmmx2; v_save_limx = v_limx2; /* v1,v2 now contains 8 element with mmx[i-1][k-1], * v3,v4 contain 8 elements with imx[i-1][k-1], * and v9,v10 contain 8 elements with dmx[i-1][k-1]. */ /* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */ v5 = vec_ld(n-4, p_tmm); v6 = vec_ld(n+12, p_tmm); v7 = vec_ld(n-4, p_tim); v8 = vec_ld(n+12, p_tim); v11 = vec_ld(n-4, p_tdm); v12 = vec_ld(n+12, p_tdm); /* load bsc[k] */ v14 = vec_ld(n, p_bsc); v15 = vec_ld(n+16, p_bsc); /* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so * we don't loop if we add the large negative numbers used for -infinity. */ v1 = vec_adds(v1,v5); v2 = vec_adds(v2,v6); v3 = vec_adds(v3,v7); v4 = vec_adds(v4,v8); v9 = vec_adds(v9,v11); v10 = vec_adds(v10,v12); v14 = vec_adds(v14,v_xmb); v15 = vec_adds(v15,v_xmb); /* Select max of mmx+TMM and imx+TIM in each element */ v1 = vec_max(v1,v3); v2 = vec_max(v2,v4); /* select max of dmx+TDM and XMX+bsc */ v9 = vec_max(v9,v14); v10 = vec_max(v10,v15); /* select max of the four alternatives */ v1 = vec_max(v1,v9); v2 = vec_max(v2,v10); /* v1,v2 now contain the max values for the new mmx; * check if we should add msc. */ v3 = vec_ld(n, p_msc); v4 = vec_ld(n+16, p_msc); v5 = (vector signed int)vec_cmpgt(v3,v_lowscore); v6 = (vector signed int)vec_cmpgt(v4,v_lowscore); /* load esc[k] */ v9 = vec_ld(n, p_esc); v10 = vec_ld(n+16, p_esc); v1 = vec_adds(v1,v3); v2 = vec_adds(v2,v4); v1 = vec_sel(v3,v1,(vector unsigned int)v5); v2 = vec_sel(v4,v2,(vector unsigned int)v6); /* have final values for mmx on this row in v1,v2 - store it */ vec_st(v1, n, mmxi); vec_st(v2, n+16, mmxi); v9 = vec_adds(v9,v1); v10 = vec_adds(v10,v2); v9 = vec_max(v9,v10); max_mmxesc = vec_max(v9,max_mmxesc); /* OK. We finished the match state. Normally we could now first * do the delete and then the insertion. The problem is that the * delete is a pain in the ass to vectorize, since each element * depends on the previous one in the vector. This means I have
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?