fast_algorithms.c

来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,087 行 · 第 1/3 页

C
1,087
字号
	 * to write this relatively simple operation as eight independent	 * ones, just as we would have done in a non-vectorized code.	 * Since this is independent of the insertion state changes, I	 * try to hide the latencies by doing the delete and insert	 * calculations in parallel.         * To make things easier I add 'del' in a comment on each	 * line for calculations that are on the delete state, and 'ins'	 * for the calculations for the insertion. Hang on...	 */	    	/* We already have the match data on this row from the previous	 * iteration in v_save_mmx. Rotate it so the element that used to be	 * is pos 4 last iteration is in position 1, and pos 2-4 contain	 * the the first three elements of mmx this iteration.         * And do the same type of rotation for v1/v2...	 */	v4 = vec_sld(v1,v2,12); /* del */        v3 = vec_sld(v_save_mmx,v1,12); /* del */	v_save_mmx = v2; /* Save for next iteration */	/* Rotate last dmx data so we have the fourth element in pos 1. */	v_save_dmx = vec_sld(v_save_dmx,v_save_dmx,12); /* del */	        /* load TMD & TDD data */        v5   = vec_ld(n-4, p_tmd); /* del */	v6   = vec_ld(n+12, p_tmd); /* del */        v7   = vec_ld(n-4, p_tdd); /* del */	v8   = vec_ld(n+12, p_tdd); /* del */		/* calculate mmx+TMD */	v3   = vec_adds(v3,v5); /* del */	v4   = vec_adds(v4,v6); /* del */	/* Start the ugly stuff. We have rotated last dmx data. Add TDD to	 * it and compare with v3/v4 (data from mmx+TMD alternative), but	 * we only compare & replace the first position, so we use a mask!	 */        /* First position: Add TDD to v_save_dmx */	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */	/* Select max of this and mmx+TMD and save it temporary to v_save_dmx */	v_save_dmx = vec_max(v_save_dmx,v3); /* del */	/* use a mask to select only the first element from v_save_dmx, rest from v3. */	v3     = vec_sel(v3,v_save_dmx,mask1); /* del */	/* Start doing the insertion calculations in parallel.	 * Load the TMI data.	 */        v9    = vec_ld(n   , p_tmi); /* ins */	v10   = vec_ld(n+16, p_tmi); /* ins */	/* Deletion:         * Now we have an accurate pos 1 in v3. continue with pos2 the same         * way. Rotate to a temporary array, add to TDD, compare and use a mask         * to write back to only position 2.	 */	v_save_dmx = vec_sld(v3,v3,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */	v_save_dmx = vec_max(v_save_dmx,v3); /* del */	v3     = vec_sel(v3,v_save_dmx,mask2); /* del */	/* More insertion stuff - load TII data */	v11   = vec_ld(n   , p_tii); /* ins */	v12   = vec_ld(n+16, p_tii); /* ins */	/* Deletion, position 3... */	v_save_dmx = vec_sld(v3,v3,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */	v_save_dmx = vec_max(v_save_dmx,v3); /* del */	v3     = vec_sel(v3,v_save_dmx,mask3); /* del */	/* insertion stuff: calculate mmx+TMI */	v9     = vec_adds(v_lmmx1,v9); /* ins */	v10    = vec_adds(v_lmmx2,v10); /* ins */	/* Deletion, position 4 */	v_save_dmx = vec_sld(v3,v3,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */	v_save_dmx = vec_max(v_save_dmx,v3); /* del */	v3     = vec_sel(v3,v_save_dmx,mask4); /* del */	/* insertion stuff: calculate imx+TII */	v11    = vec_adds(v_limx1,v11); /* ins */	v12    = vec_adds(v_limx2,v12); /* ins */	/* That was the first deletion vector, but we are unrolling.	 * The next step is position '5', i.e. the first in vector 2.	 * This one depends on the last position of vector 1, which we just finished.	 */	v_save_dmx = vec_sld(v3,v3,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */	v_save_dmx = vec_max(v_save_dmx,v4); /* del */	v4     = vec_sel(v4,v_save_dmx,mask1); /* del */	/* insertion stuff: select max of mmx+TMI and imx+TII */	v9     = vec_max(v9,v11); /* ins */	v10    = vec_max(v10,v12); /* ins */	/* insertion stuff: load data from hmm->isc[tmpidx] */	v11    = vec_ld(n, p_isc); /* ins */	v12    = vec_ld(n+16, p_isc); /* ins */		/* position 6 (2 in vector 2) */	v_save_dmx = vec_sld(v4,v4,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */	v_save_dmx = vec_max(v_save_dmx,v4); /* del */	v4     = vec_sel(v4,v_save_dmx,mask2); /* del */	/* insertion: compare max of mmx+TMI, imx+TII with hmm->isc */	v13    = (vector signed int)vec_cmpgt(v11,v_lowscore);	v14    = (vector signed int)vec_cmpgt(v12,v_lowscore);		/* position 7 (3 in vector 2) */	v_save_dmx = vec_sld(v4,v4,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */	v_save_dmx = vec_max(v_save_dmx,v4); /* del */	v4     = vec_sel(v4,v_save_dmx,mask3); /* del */	v9     = vec_adds(v9,v11);	v10    = vec_adds(v10,v12);	v9     = vec_sel(v11,v9,(vector unsigned int)v13);	v10    = vec_sel(v12,v10,(vector unsigned int)v14);		/* position 8 (4 in vector 2) */	v_save_dmx = vec_sld(v4,v4,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */	v_save_dmx = vec_max(v_save_dmx,v4); /* del */	v_save_dmx = vec_sel(v4,v_save_dmx,mask4); /* del */	/* Puh! that was the deletion... v3/v_save_dmx now contain the updated dmx data;	 * save it to memory. (v_save_dmx will be used next iteration)	 */	vec_st(v3,        n, dmxi); /* del */	vec_st(v_save_dmx, n+16, dmxi); /* del */		/* save insertion too */	vec_st(v9, n, imxi);	vec_st(v10,n+16,imxi);	     }    /* odd loop */    if(k< (1+hmm->M)) {	/* match state */	/* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better,	* but do it for 4 elements in parallel.	* Since we are comparing with data on the previous row but one position	* earlier, we have to shift stuff. Load two new vectors each round,	* and use the saved register from last round.	*/	/* load mmx data */	v_lmmx1 = vec_ld(n,    lmmxi);	/* load imx data */	v_limx1 = vec_ld(n,    limxi);	v5    = vec_ld(n,    ldmxi);	/* Load dmx data */	/* shift mmx, imx & dmx data */	v1    = vec_sld(v_save_lmmx,v_lmmx1,12);	v3    = vec_sld(v_save_limx,v_limx1,12);	v9    = vec_sld(v_save_ldmx,v5,12);	/* v1,v2 now contains 8 element with mmx[i-1][k-1],	    * v3,v4 contain 8 elements with imx[i-1][k-1],	    * and v9,v10 contain 8 elements with dmx[i-1][k-1].	    */	/* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */        v5    = vec_ld(n-4, p_tmm);        v7    = vec_ld(n-4, p_tim);        v11   = vec_ld(n-4, p_tdm);	/* load bsc[k] */	v14   = vec_ld(n, p_bsc);	/* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so	    * we don't loop if we add the large negative numbers used for -infinity.	    */	v1    = vec_adds(v1,v5);	v3    = vec_adds(v3,v7);	v9    = vec_adds(v9,v11);	v14   = vec_adds(v14,v_xmb);	/* Select max of mmx+TMM and imx+TIM in each element */	v1    = vec_max(v1,v3);	/* select max of dmx+TDM and XMX+bsc */	v9    = vec_max(v9,v14);        /* select max of the four alternatives */	v1    = vec_max(v1,v9);	/* v1,v2 now contain the max values for the new mmx;	* check if we should add msc.	    */        v3    = vec_ld(n,    p_msc);        v5    = (vector signed int)vec_cmpgt(v3,v_lowscore);	/* load esc[k] */	v9    = vec_ld(n, p_esc);	v1    = vec_adds(v1,v3);        v1    = vec_sel(v3,v1,(vector unsigned int)v5);	/* have final values for mmx on this row in v1,v2 - store it */	vec_st(v1, n,    mmxi);	v9    = vec_adds(v9,v1);	max_mmxesc = vec_max(v9,max_mmxesc);		/* OK. We finished the match state. Normally we could now first	    * do the delete and then the insertion. The problem is that the	    * delete is a pain in the ass to vectorize, since each element	    * depends on the previous one in the vector. This means I have	    * to write this relatively simple operation as eight independent	    * ones, just as we would have done in a non-vectorized code.	    * Since this is independent of the insertion state changes, I	    * try to hide the latencies by doing the delete and insert	    * calculations in parallel.	    * To make things easier I add 'del' in a comment on each	    * line for calculations that are on the delete state, and 'ins'	    * for the calculations for the insertion. Hang on...	    */	/* We already have the match data on this row from the previous	    * iteration in v_save_mmx. Rotate it so the element that used to be	    * is pos 4 last iteration is in position 1, and pos 2-4 contain	    * the the first three elements of mmx this iteration.	    * And do the same type of rotation for v1/v2...	    */        v3 = vec_sld(v_save_mmx,v1,12); /* del */	/* Rotate last dmx data so we have the fourth element in pos 1. */	v_save_dmx = vec_sld(v_save_dmx,v_save_dmx,12); /* del */        /* load TMD & TDD data */        v5   = vec_ld(n-4, p_tmd); /* del */        v7   = vec_ld(n-4, p_tdd); /* del */	/* calculate mmx+TMD */	v3   = vec_adds(v3,v5); /* del */	/* Start the ugly stuff. We have rotated last dmx data. Add TDD to	    * it and compare with v3/v4 (data from mmx+TMD alternative), but	    * we only compare & replace the first position, so we use a mask!	    */        /* First position: Add TDD to v_save_dmx */	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */	/* Select max of this and mmx+TMD and save it temporary to v_save_dmx */	v_save_dmx = vec_max(v_save_dmx,v3); /* del */	/* use a mask to select only the first element from v_save_dmx, rest from v3. */	v3     = vec_sel(v3,v_save_dmx,mask1); /* del */	/* Start doing the insertion calculations in parallel.	    * Load the TMI data.	    */        v9    = vec_ld(n   , p_tmi); /* ins */	/* Deletion:	    * Now we have an accurate pos 1 in v3. continue with pos2 the same	    * way. Rotate to a temporary array, add to TDD, compare and use a mask	    * to write back to only position 2.	    */	v_save_dmx = vec_sld(v3,v3,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */	v_save_dmx = vec_max(v_save_dmx,v3); /* del */	v3     = vec_sel(v3,v_save_dmx,mask2); /* del */	/* More insertion stuff - load TII data */	v11   = vec_ld(n   , p_tii); /* ins */	/* Deletion, position 3... */	v_save_dmx = vec_sld(v3,v3,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */	v_save_dmx = vec_max(v_save_dmx,v3); /* del */	v3     = vec_sel(v3,v_save_dmx,mask3); /* del */	/* insertion stuff: calculate mmx+TMI */	v9     = vec_adds(v_lmmx1,v9); /* ins */	/* Deletion, position 4 */	v_save_dmx = vec_sld(v3,v3,12); /* del */	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */	v_save_dmx = vec_max(v_save_dmx,v3); /* del */	v3     = vec_sel(v3,v_save_dmx,mask4); /* del */	/* insertion stuff: calculate imx+TII */	v11    = vec_adds(v_limx1,v11); /* ins */	/* insertion stuff: select max of mmx+TMI and imx+TII */	v9     = vec_max(v9,v11); /* ins */	/* insertion stuff: load data from hmm->isc[tmpidx] */	v11    = vec_ld(n, p_isc); /* ins */	/* insertion: compare max of mmx+TMI, imx+TII with hmm->isc */	v13    = (vector signed int)vec_cmpgt(v11,v_lowscore);	v9     = vec_adds(v9,v11);	v9     = vec_sel(v11,v9,(vector unsigned int)v13);	/* Puh! that was the deletion... v3/v_save_dmx now contain the updated dmx data;	* save it to memory. (v_save_dmx will be used next iteration)	    */	vec_st(v3,        n, dmxi); /* del */	/* save insertion too */	vec_st(v9, n, imxi);    }    /* end of k loops */    /* Now the special states. Order is important here.	* remember, C and J emissions are zero score by definition,	*/				/* N state */    xmx[i][XMN] = -INFTY;    if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)	xmx[i][XMN] = sc;				/* E state */    v2 = vec_sld(max_mmxesc,max_mmxesc,8);    v2 = vec_max(v2,max_mmxesc);    v1 = vec_sld(v2,v2,4);    v1 = vec_max(v1,v2);    vec_ste(v1,XME*4,xmxi);    				/* J state */    xmxi[XMJ] = -INFTY;    if ((sc = lxmxi[XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)      xmxi[XMJ] = sc;    if ((sc = xmxi[XME]   + hmm->xsc[XTE][LOOP]) > xmxi[XMJ])      xmxi[XMJ] = sc;				/* B state */    xmxi[XMB] = -INFTY;    if ((sc = xmxi[XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)      xmxi[XMB] = sc;    if ((sc = xmxi[XMJ] + hmm->xsc[XTJ][MOVE]) > xmxi[XMB])      xmxi[XMB] = sc;				/* C state */    xmxi[XMC] = -INFTY;    if ((sc = lxmxi[XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)      xmxi[XMC] = sc;    if ((sc = xmxi[XME] + hmm->xsc[XTE][MOVE]) > xmxi[XMC])      xmxi[XMC] = sc;  }  /* T state (not stored) */  sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];  if (ret_tr != NULL) {    P7ViterbiTrace(hmm, dsq, L, mx, &tr);    *ret_tr = tr;   }    /* Note (Lindahl): we do NOT free the dpmatrix here anymore - the code was    * spending 30% of the runtime allocating/freeing memory.   * Provide a pointer to a dpmatrix_s structure to this routine,   * and we try to reuse it. After the final call to P7Viterbi,   * free it with FreePlan7Matrix.   */  return Scorify(sc);		/* the total Viterbi score. */}#endif /*the ALTIVEC port*/

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?