fast_algorithms.c
来自「这是一个基于HMM 模型的生物多序列比对算法的linux实现版本。hmmer」· C语言 代码 · 共 1,087 行 · 第 1/3 页
C
1,087 行
* to write this relatively simple operation as eight independent * ones, just as we would have done in a non-vectorized code. * Since this is independent of the insertion state changes, I * try to hide the latencies by doing the delete and insert * calculations in parallel. * To make things easier I add 'del' in a comment on each * line for calculations that are on the delete state, and 'ins' * for the calculations for the insertion. Hang on... */ /* We already have the match data on this row from the previous * iteration in v_save_mmx. Rotate it so the element that used to be * is pos 4 last iteration is in position 1, and pos 2-4 contain * the the first three elements of mmx this iteration. * And do the same type of rotation for v1/v2... */ v4 = vec_sld(v1,v2,12); /* del */ v3 = vec_sld(v_save_mmx,v1,12); /* del */ v_save_mmx = v2; /* Save for next iteration */ /* Rotate last dmx data so we have the fourth element in pos 1. */ v_save_dmx = vec_sld(v_save_dmx,v_save_dmx,12); /* del */ /* load TMD & TDD data */ v5 = vec_ld(n-4, p_tmd); /* del */ v6 = vec_ld(n+12, p_tmd); /* del */ v7 = vec_ld(n-4, p_tdd); /* del */ v8 = vec_ld(n+12, p_tdd); /* del */ /* calculate mmx+TMD */ v3 = vec_adds(v3,v5); /* del */ v4 = vec_adds(v4,v6); /* del */ /* Start the ugly stuff. We have rotated last dmx data. Add TDD to * it and compare with v3/v4 (data from mmx+TMD alternative), but * we only compare & replace the first position, so we use a mask! */ /* First position: Add TDD to v_save_dmx */ v_save_dmx = vec_adds(v_save_dmx,v7); /* del */ /* Select max of this and mmx+TMD and save it temporary to v_save_dmx */ v_save_dmx = vec_max(v_save_dmx,v3); /* del */ /* use a mask to select only the first element from v_save_dmx, rest from v3. */ v3 = vec_sel(v3,v_save_dmx,mask1); /* del */ /* Start doing the insertion calculations in parallel. * Load the TMI data. */ v9 = vec_ld(n , p_tmi); /* ins */ v10 = vec_ld(n+16, p_tmi); /* ins */ /* Deletion: * Now we have an accurate pos 1 in v3. continue with pos2 the same * way. Rotate to a temporary array, add to TDD, compare and use a mask * to write back to only position 2. */ v_save_dmx = vec_sld(v3,v3,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v7); /* del */ v_save_dmx = vec_max(v_save_dmx,v3); /* del */ v3 = vec_sel(v3,v_save_dmx,mask2); /* del */ /* More insertion stuff - load TII data */ v11 = vec_ld(n , p_tii); /* ins */ v12 = vec_ld(n+16, p_tii); /* ins */ /* Deletion, position 3... */ v_save_dmx = vec_sld(v3,v3,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v7); /* del */ v_save_dmx = vec_max(v_save_dmx,v3); /* del */ v3 = vec_sel(v3,v_save_dmx,mask3); /* del */ /* insertion stuff: calculate mmx+TMI */ v9 = vec_adds(v_lmmx1,v9); /* ins */ v10 = vec_adds(v_lmmx2,v10); /* ins */ /* Deletion, position 4 */ v_save_dmx = vec_sld(v3,v3,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v7); /* del */ v_save_dmx = vec_max(v_save_dmx,v3); /* del */ v3 = vec_sel(v3,v_save_dmx,mask4); /* del */ /* insertion stuff: calculate imx+TII */ v11 = vec_adds(v_limx1,v11); /* ins */ v12 = vec_adds(v_limx2,v12); /* ins */ /* That was the first deletion vector, but we are unrolling. * The next step is position '5', i.e. the first in vector 2. * This one depends on the last position of vector 1, which we just finished. */ v_save_dmx = vec_sld(v3,v3,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v8); /* del */ v_save_dmx = vec_max(v_save_dmx,v4); /* del */ v4 = vec_sel(v4,v_save_dmx,mask1); /* del */ /* insertion stuff: select max of mmx+TMI and imx+TII */ v9 = vec_max(v9,v11); /* ins */ v10 = vec_max(v10,v12); /* ins */ /* insertion stuff: load data from hmm->isc[tmpidx] */ v11 = vec_ld(n, p_isc); /* ins */ v12 = vec_ld(n+16, p_isc); /* ins */ /* position 6 (2 in vector 2) */ v_save_dmx = vec_sld(v4,v4,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v8); /* del */ v_save_dmx = vec_max(v_save_dmx,v4); /* del */ v4 = vec_sel(v4,v_save_dmx,mask2); /* del */ /* insertion: compare max of mmx+TMI, imx+TII with hmm->isc */ v13 = (vector signed int)vec_cmpgt(v11,v_lowscore); v14 = (vector signed int)vec_cmpgt(v12,v_lowscore); /* position 7 (3 in vector 2) */ v_save_dmx = vec_sld(v4,v4,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v8); /* del */ v_save_dmx = vec_max(v_save_dmx,v4); /* del */ v4 = vec_sel(v4,v_save_dmx,mask3); /* del */ v9 = vec_adds(v9,v11); v10 = vec_adds(v10,v12); v9 = vec_sel(v11,v9,(vector unsigned int)v13); v10 = vec_sel(v12,v10,(vector unsigned int)v14); /* position 8 (4 in vector 2) */ v_save_dmx = vec_sld(v4,v4,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v8); /* del */ v_save_dmx = vec_max(v_save_dmx,v4); /* del */ v_save_dmx = vec_sel(v4,v_save_dmx,mask4); /* del */ /* Puh! that was the deletion... v3/v_save_dmx now contain the updated dmx data; * save it to memory. (v_save_dmx will be used next iteration) */ vec_st(v3, n, dmxi); /* del */ vec_st(v_save_dmx, n+16, dmxi); /* del */ /* save insertion too */ vec_st(v9, n, imxi); vec_st(v10,n+16,imxi); } /* odd loop */ if(k< (1+hmm->M)) { /* match state */ /* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better, * but do it for 4 elements in parallel. * Since we are comparing with data on the previous row but one position * earlier, we have to shift stuff. Load two new vectors each round, * and use the saved register from last round. */ /* load mmx data */ v_lmmx1 = vec_ld(n, lmmxi); /* load imx data */ v_limx1 = vec_ld(n, limxi); v5 = vec_ld(n, ldmxi); /* Load dmx data */ /* shift mmx, imx & dmx data */ v1 = vec_sld(v_save_lmmx,v_lmmx1,12); v3 = vec_sld(v_save_limx,v_limx1,12); v9 = vec_sld(v_save_ldmx,v5,12); /* v1,v2 now contains 8 element with mmx[i-1][k-1], * v3,v4 contain 8 elements with imx[i-1][k-1], * and v9,v10 contain 8 elements with dmx[i-1][k-1]. */ /* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */ v5 = vec_ld(n-4, p_tmm); v7 = vec_ld(n-4, p_tim); v11 = vec_ld(n-4, p_tdm); /* load bsc[k] */ v14 = vec_ld(n, p_bsc); /* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so * we don't loop if we add the large negative numbers used for -infinity. */ v1 = vec_adds(v1,v5); v3 = vec_adds(v3,v7); v9 = vec_adds(v9,v11); v14 = vec_adds(v14,v_xmb); /* Select max of mmx+TMM and imx+TIM in each element */ v1 = vec_max(v1,v3); /* select max of dmx+TDM and XMX+bsc */ v9 = vec_max(v9,v14); /* select max of the four alternatives */ v1 = vec_max(v1,v9); /* v1,v2 now contain the max values for the new mmx; * check if we should add msc. */ v3 = vec_ld(n, p_msc); v5 = (vector signed int)vec_cmpgt(v3,v_lowscore); /* load esc[k] */ v9 = vec_ld(n, p_esc); v1 = vec_adds(v1,v3); v1 = vec_sel(v3,v1,(vector unsigned int)v5); /* have final values for mmx on this row in v1,v2 - store it */ vec_st(v1, n, mmxi); v9 = vec_adds(v9,v1); max_mmxesc = vec_max(v9,max_mmxesc); /* OK. We finished the match state. Normally we could now first * do the delete and then the insertion. The problem is that the * delete is a pain in the ass to vectorize, since each element * depends on the previous one in the vector. This means I have * to write this relatively simple operation as eight independent * ones, just as we would have done in a non-vectorized code. * Since this is independent of the insertion state changes, I * try to hide the latencies by doing the delete and insert * calculations in parallel. * To make things easier I add 'del' in a comment on each * line for calculations that are on the delete state, and 'ins' * for the calculations for the insertion. Hang on... */ /* We already have the match data on this row from the previous * iteration in v_save_mmx. Rotate it so the element that used to be * is pos 4 last iteration is in position 1, and pos 2-4 contain * the the first three elements of mmx this iteration. * And do the same type of rotation for v1/v2... */ v3 = vec_sld(v_save_mmx,v1,12); /* del */ /* Rotate last dmx data so we have the fourth element in pos 1. */ v_save_dmx = vec_sld(v_save_dmx,v_save_dmx,12); /* del */ /* load TMD & TDD data */ v5 = vec_ld(n-4, p_tmd); /* del */ v7 = vec_ld(n-4, p_tdd); /* del */ /* calculate mmx+TMD */ v3 = vec_adds(v3,v5); /* del */ /* Start the ugly stuff. We have rotated last dmx data. Add TDD to * it and compare with v3/v4 (data from mmx+TMD alternative), but * we only compare & replace the first position, so we use a mask! */ /* First position: Add TDD to v_save_dmx */ v_save_dmx = vec_adds(v_save_dmx,v7); /* del */ /* Select max of this and mmx+TMD and save it temporary to v_save_dmx */ v_save_dmx = vec_max(v_save_dmx,v3); /* del */ /* use a mask to select only the first element from v_save_dmx, rest from v3. */ v3 = vec_sel(v3,v_save_dmx,mask1); /* del */ /* Start doing the insertion calculations in parallel. * Load the TMI data. */ v9 = vec_ld(n , p_tmi); /* ins */ /* Deletion: * Now we have an accurate pos 1 in v3. continue with pos2 the same * way. Rotate to a temporary array, add to TDD, compare and use a mask * to write back to only position 2. */ v_save_dmx = vec_sld(v3,v3,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v7); /* del */ v_save_dmx = vec_max(v_save_dmx,v3); /* del */ v3 = vec_sel(v3,v_save_dmx,mask2); /* del */ /* More insertion stuff - load TII data */ v11 = vec_ld(n , p_tii); /* ins */ /* Deletion, position 3... */ v_save_dmx = vec_sld(v3,v3,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v7); /* del */ v_save_dmx = vec_max(v_save_dmx,v3); /* del */ v3 = vec_sel(v3,v_save_dmx,mask3); /* del */ /* insertion stuff: calculate mmx+TMI */ v9 = vec_adds(v_lmmx1,v9); /* ins */ /* Deletion, position 4 */ v_save_dmx = vec_sld(v3,v3,12); /* del */ v_save_dmx = vec_adds(v_save_dmx,v7); /* del */ v_save_dmx = vec_max(v_save_dmx,v3); /* del */ v3 = vec_sel(v3,v_save_dmx,mask4); /* del */ /* insertion stuff: calculate imx+TII */ v11 = vec_adds(v_limx1,v11); /* ins */ /* insertion stuff: select max of mmx+TMI and imx+TII */ v9 = vec_max(v9,v11); /* ins */ /* insertion stuff: load data from hmm->isc[tmpidx] */ v11 = vec_ld(n, p_isc); /* ins */ /* insertion: compare max of mmx+TMI, imx+TII with hmm->isc */ v13 = (vector signed int)vec_cmpgt(v11,v_lowscore); v9 = vec_adds(v9,v11); v9 = vec_sel(v11,v9,(vector unsigned int)v13); /* Puh! that was the deletion... v3/v_save_dmx now contain the updated dmx data; * save it to memory. (v_save_dmx will be used next iteration) */ vec_st(v3, n, dmxi); /* del */ /* save insertion too */ vec_st(v9, n, imxi); } /* end of k loops */ /* Now the special states. Order is important here. * remember, C and J emissions are zero score by definition, */ /* N state */ xmx[i][XMN] = -INFTY; if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY) xmx[i][XMN] = sc; /* E state */ v2 = vec_sld(max_mmxesc,max_mmxesc,8); v2 = vec_max(v2,max_mmxesc); v1 = vec_sld(v2,v2,4); v1 = vec_max(v1,v2); vec_ste(v1,XME*4,xmxi); /* J state */ xmxi[XMJ] = -INFTY; if ((sc = lxmxi[XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY) xmxi[XMJ] = sc; if ((sc = xmxi[XME] + hmm->xsc[XTE][LOOP]) > xmxi[XMJ]) xmxi[XMJ] = sc; /* B state */ xmxi[XMB] = -INFTY; if ((sc = xmxi[XMN] + hmm->xsc[XTN][MOVE]) > -INFTY) xmxi[XMB] = sc; if ((sc = xmxi[XMJ] + hmm->xsc[XTJ][MOVE]) > xmxi[XMB]) xmxi[XMB] = sc; /* C state */ xmxi[XMC] = -INFTY; if ((sc = lxmxi[XMC] + hmm->xsc[XTC][LOOP]) > -INFTY) xmxi[XMC] = sc; if ((sc = xmxi[XME] + hmm->xsc[XTE][MOVE]) > xmxi[XMC]) xmxi[XMC] = sc; } /* T state (not stored) */ sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE]; if (ret_tr != NULL) { P7ViterbiTrace(hmm, dsq, L, mx, &tr); *ret_tr = tr; } /* Note (Lindahl): we do NOT free the dpmatrix here anymore - the code was * spending 30% of the runtime allocating/freeing memory. * Provide a pointer to a dpmatrix_s structure to this routine, * and we try to reuse it. After the final call to P7Viterbi, * free it with FreePlan7Matrix. */ return Scorify(sc); /* the total Viterbi score. */}#endif /*the ALTIVEC port*/
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?