📄 atl_cncmmjik.c
字号:
i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_b1); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_bX); } else { i0mm_bX=r1mm=r0mm_bX=Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_b1); } } else { r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); if (BetaIsOne) { r1mm = r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); i1mm = i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); } else if (BetaIsZero) { i0mm_bX = r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_b0); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); } else { i0mm_bX=r1mm=r0mm_bX=Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); } } mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TN),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TN),0x0x0_aX_bX); } else if (TB == AtlasConjTrans) { ai1 = ar0 = -ralpha; ai0 = ralpha; if (AlphaIsOne) { r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1); if (BetaIsOne) { r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); } else if (BetaIsZero) { r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_b0); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b0); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); } else { r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); } } else { r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); if (BetaIsOne) { r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); } else if (BetaIsZero) { r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_b0); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_b0); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); } else { r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); } } mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TT),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TT),0x0x0_aX_bX); } else { ar0 = ai0 = ai1 = ralpha; if (AlphaIsOne) { r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1); if (BetaIsOne) { r1mm = r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i1mm = i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1); } else if (BetaIsZero) { i0mm_bX = r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b0); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); } else { i0mm_bX=r1mm=r0mm_bX=Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1); } } else { r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); if (BetaIsOne) { r1mm = r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i1mm = i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); } else if (BetaIsZero) { i0mm_bX = r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_b0); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); } else { i0mm_bX=r1mm=r0mm_bX=Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); } } mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TT),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TT),0x0x0_aX_bX); } } if (TA == AtlasNoTrans) { incAk = lda * (KB<<1); incAm = (MB<<1) - Kb * incAk; incAn = -Mb * (MB<<1); } else { incAk = KB<<1; incAm = (lda*MB - Kb*KB)<<1; incAn = -lda*(MB<<1)*Mb; } if (TB == AtlasNoTrans) { incBk = KB<<1; incBm = -(KB<<1)*Kb; incBn = ldb*(NB<<1); } else { incBk = (KB<<1)*ldb; incBm = -Kb * incBk; incBn = NB<<1; } for (j=Nb; j; j--, a += incAn, b += incBn, c += incCn) { for (i=Mb; i; i--, a += incAm, b += incBm, c += incCm) { if (Kb) { r0mm_bX(MB, NB, KB, ar0, a+1, lda, b+1, ldb, nrbeta, c, ldc); i0mm_bX(MB, NB, KB, ai0, a+1, lda, b, ldb, rbeta, c+1, ldc); r1mm(MB, NB, KB, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); i1mm(MB, NB, KB, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); a += incAk; b += incBk; for (k=Kb-1; k; k--, a += incAk, b += incBk) { r0mm_b1(MB, NB, KB, ar0, a+1, lda, b+1, ldb, ATL_rnone, c, ldc); i0mm_b1(MB, NB, KB, ai0, a+1, lda, b, ldb, ATL_rone, c+1, ldc); r1mm(MB, NB, KB, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); i1mm(MB, NB, KB, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); } if (kr) { mmcu(MB, NB, kr, ar0, a+1, lda, b+1, ldb, ATL_rnone, c, ldc); mmcu(MB, NB, kr, ai0, a+1, lda, b, ldb, ATL_rone, c+1, ldc); mmcu(MB, NB, kr, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mmcu(MB, NB, kr, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); } } else if (kr) { if (BetaIsZero) Mjoin(PATL,gezero)(MB, NB, c, ldc); mmcu(MB, NB, kr, ar0, a+1, lda, b+1, ldb, nrbeta, c, ldc); mmcu(MB, NB, kr, ai0, a+1, lda, b, ldb, rbeta, c+1, ldc); mmcu(MB, NB, kr, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mmcu(MB, NB, kr, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); } } } if (mr && N != nr) Mjoin(PATL,NCmmIJK)(TA, TB, mr, N-nr, K, alpha, A+Mb*(incAm+Kb*incAk), lda, B, ldb, beta, C+Mb*(MB<<1), ldc); if (nr) { for (i=Mb; i; i--, a += incAm, b += incBm, c += incCm) { if (BetaIsZero) Mjoin(PATL,gezero)(MB, nr, c, ldc); if (Kb) { mm_fixedKcu(MB, nr, KB, ar0, a+1, lda, b+1, ldb, nrbeta, c, ldc); mm_fixedKcu(MB, nr, KB, ai0, a+1, lda, b, ldb, rbeta, c+1, ldc); mm_fixedKcu(MB, nr, KB, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mm_fixedKcu(MB, nr, KB, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); a += incAk; b += incBk; for (k=Kb-1; k; k--, a += incAk, b += incBk) { mm_fixedKcu(MB, nr, KB, ar0, a+1, lda, b+1, ldb, ATL_rnone, c, ldc); mm_fixedKcu(MB, nr, KB, ai0, a+1, lda, b, ldb, ATL_rone, c+1, ldc); mm_fixedKcu(MB, nr, KB, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mm_fixedKcu(MB, nr, KB, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); } if (kr) { mmcu(MB, nr, kr, ar0, a+1, lda, b+1, ldb, ATL_rnone, c, ldc); mmcu(MB, nr, kr, ai0, a+1, lda, b, ldb, ATL_rone, c+1, ldc); mmcu(MB, nr, kr, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mmcu(MB, nr, kr, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); } } else if (kr) { mmcu(MB, nr, kr, ar0, a+1, lda, b+1, ldb, nrbeta, c, ldc); mmcu(MB, nr, kr, ai0, a+1, lda, b, ldb, rbeta, c+1, ldc); mmcu(MB, nr, kr, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mmcu(MB, nr, kr, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); } } if (mr) /* cleanup small mr x nr block of C */ { c = C + ((Mb*MB + ldc*Nb*NB)<<1); a = A + Mb*(incAm+Kb*incAk); b = B + Nb*( incBn+(Mb*(incBm+Kb*incBk)) ); if (BetaIsZero) Mjoin(PATL,gezero)(mr, nr, c, ldc); if (Kb) { mm_fixedKcu(mr, nr, KB, ar0, a+1, lda, b+1, ldb, nrbeta, c, ldc); mm_fixedKcu(mr, nr, KB, ai0, a+1, lda, b, ldb, rbeta, c+1, ldc); mm_fixedKcu(mr, nr, KB, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mm_fixedKcu(mr, nr, KB, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); a += incAk; b += incBk; for (k=Kb-1; k; k--, a += incAk, b += incBk) { mm_fixedKcu(mr, nr, KB, ar0, a+1, lda, b+1, ldb, ATL_rnone, c, ldc); mm_fixedKcu(mr, nr, KB, ai0, a+1, lda, b, ldb, ATL_rone, c+1, ldc); mm_fixedKcu(mr, nr, KB, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mm_fixedKcu(mr, nr, KB, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); } if (kr) { mmcu(mr, nr, kr, ar0, a+1, lda, b+1, ldb, ATL_rnone, c, ldc); mmcu(mr, nr, kr, ai0, a+1, lda, b, ldb, ATL_rone, c+1, ldc); mmcu(mr, nr, kr, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mmcu(mr, nr, kr, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); } } else if (kr) { mmcu(mr, nr, kr, ar0, a+1, lda, b+1, ldb, nrbeta, c, ldc); mmcu(mr, nr, kr, ai0, a+1, lda, b, ldb, rbeta, c+1, ldc); mmcu(mr, nr, kr, ralpha, a, lda, b, ldb, ATL_rnone, c, ldc); mmcu(mr, nr, kr, ai1, a, lda, b+1, ldb, ATL_rone, c+1, ldc); } } } return(0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -