📄 atl_cncmmijk_c.c
字号:
r0mm_b1 = r1mm = Mjoin(Mjoin(Mjoin(NCmm0,NN),0x0x0),_a1_bX); mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),NN),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),NN),0x0x0_aX_bX); } else if (TB == AtlasConjTrans) { ai1 = ar0 = ATL_rnone; ai0 = ATL_rone; r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_aX_b0); r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_b0); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_b1); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_aX_bX); mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),NT),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),NT),0x0x0_aX_bX); } else { ar0 = ai0 = ai1 = ATL_rone; i0mm_bX = r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_b0); i1mm = i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_b1); r0mm_b1 = r1mm = Mjoin(Mjoin(Mjoin(NCmm0,NT),0x0x0),_a1_bX); mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),NT),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),NT),0x0x0_aX_bX); } } else if (TA == AtlasConjTrans) { if (TB == AtlasNoTrans) { ai0 = ar0 = ATL_rnone; ai1 = ATL_rone; r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_b0); r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_b0); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_aX_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_b1); mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TN),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TN),0x0x0_aX_bX); } else if (TB == AtlasConjTrans) { ar0 = ATL_rone; ai1 = ai0 = ATL_rnone; r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b0); r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_b0); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TT),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TT),0x0x0_aX_bX); } else /* TA == AtlasConjTrans, TB == AtlasTrans */ { ai0 = ar0 = ATL_rnone; ai1 = ATL_rone; r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_b0); r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_b0); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1); mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TT),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TT),0x0x0_aX_bX); } } else { if (TB == AtlasNoTrans) { ar0 = ai0 = ai1 = ATL_rone; i0mm_bX = r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_b0); i1mm = i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_b1); r0mm_b1 = r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TN),0x0x0),_a1_bX); mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TN),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TN),0x0x0_aX_bX); } else if (TB == AtlasConjTrans) { ai1 = ar0 = ATL_rnone; ai0 = ATL_rone; r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_b0); r0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); i0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b0); i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1); i1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_aX_bX); mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TT),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TT),0x0x0_aX_bX); } else { ar0 = ai0 = ai1 = ATL_rone; i0mm_bX = r0mm_bX = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b0); i1mm = i0mm_b1 = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_b1); r0mm_b1 = r1mm = Mjoin(Mjoin(Mjoin(NCmm0,TT),0x0x0),_a1_bX); mm_fixedKcu=Mjoin(Mjoin(Mjoin(NCmm00,Mjoin(0x0x,KB)),TT),0x0x0_aX_bX); mmcu = Mjoin(Mjoin(Mjoin(NCmm00,0x0x0),TT),0x0x0_aX_bX); } } if (TA == AtlasNoTrans) { incAk = lda * (KB<<1); incAn = -Kb * incAk; incAm = MB<<1; } else { incAk = KB<<1; incAn = -Kb * incAk; incAm = (MB<<1) * lda; } if (TB == AtlasNoTrans) { incBk = KB<<1; incBn = (ldb*NB - K + kr)<<1; incBm = -((Nb * ldb * NB)<<1); } else { incBk = (KB<<1)*ldb; incBn = (NB<<1) - Kb*incBk; incBm = -Nb*(NB<<1); } if (AlphaIsOne) { if (BetaIsOne) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_b1); else if (BetaIsZero) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_b0); else if (BetaIsReal) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_bXi0); else geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_a1),_bX); } else if (AlphaIsReal) { if (BetaIsOne) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aXi0),_b1); else if (BetaIsZero) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aXi0),_b0); else if (BetaIsReal) geadd=Mjoin(Mjoin(Mjoin(PATL,geadd),_aXi0),_bXi0); else geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aXi0),_bX); } else if (BetaIsOne) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_b1); else if (BetaIsZero) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_b0); else if (BetaIsReal) geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_bXi0); else geadd = Mjoin(Mjoin(Mjoin(PATL,geadd),_aX),_bX); vp = malloc(ATL_Cachelen + ATL_MulBySize(MB * NB)); ATL_assert(vp); cp = ATL_AlignPtr(vp); if (mr || nr || kr) Mjoin(PATL,gezero)(MB, NB, cp, MB); for (i=Mb; i; i--, a += incAm, b += incBm, c += incCm) { for (j=Nb; j; j--, a += incAn, b += incBn, c += incCn) { if (Kb) { r0mm_bX(MB, NB, KB, ar0, a+1, lda, b+1, ldb, ATL_rzero, cp, MB); i0mm_bX(MB, NB, KB, ai0, a+1, lda, b, ldb, ATL_rzero, cp+1, MB); r1mm(MB, NB, KB, ATL_rone, a, lda, b, ldb, ATL_rnone, cp, MB); i1mm(MB, NB, KB, ai1, a, lda, b+1, ldb, ATL_rone, cp+1, MB); a += incAk; b += incBk; for (k=Kb-1; k; k--, a += incAk, b += incBk) { r0mm_b1(MB, NB, KB, ar0, a+1, lda, b+1, ldb, ATL_rnone, cp, MB); i0mm_b1(MB, NB, KB, ai0, a+1, lda, b, ldb, ATL_rone, cp+1, MB); r1mm(MB, NB, KB, ATL_rone, a, lda, b, ldb, ATL_rnone, cp, MB); i1mm(MB, NB, KB, ai1, a, lda, b+1, ldb, ATL_rone, cp+1, MB); } if (kr) { mmcu(MB, NB, kr, ar0, a+1, lda, b+1, ldb, ATL_rnone, cp, MB); mmcu(MB, NB, kr, ai0, a+1, lda, b, ldb, ATL_rone, cp+1, MB); mmcu(MB, NB, kr, ATL_rone, a, lda, b, ldb, ATL_rnone, cp, MB); mmcu(MB, NB, kr, ai1, a, lda, b+1, ldb, ATL_rone, cp+1, MB); } } else if (kr) { Mjoin(PATL,zero)(MB*NB, cp, 1); /* kill NaN/INF from before */ mmcu(MB, NB, kr, ar0, a+1, lda, b+1, ldb, ATL_rzero, cp, MB); mmcu(MB, NB, kr, ai0, a+1, lda, b, ldb, ATL_rzero, cp+1, MB); mmcu(MB, NB, kr, ATL_rone, a, lda, b, ldb, ATL_rnone, cp, MB); mmcu(MB, NB, kr, ai1, a, lda, b+1, ldb, ATL_rone, cp+1, MB); } geadd(MB, NB, alpha, cp, MB, beta, c, ldc); } } if (mr) /* M-loop remainder */ { for (j=Nb; j; j--, a += incAn, b += incBn, c += incCn) { Mjoin(PATL,zero)(MB*NB, cp, 1); /* kill NaN/INF from before */ if (Kb) { mm_fixedKcu(mr, NB, KB, ar0, a+1, lda, b+1, ldb, ATL_rzero, cp, MB); mm_fixedKcu(mr, NB, KB, ai0, a+1, lda, b, ldb, ATL_rzero, cp+1, MB); mm_fixedKcu(mr, NB, KB, ATL_rone, a, lda, b, ldb, ATL_rnone, cp, MB); mm_fixedKcu(mr, NB, KB, ai1, a, lda, b+1, ldb, ATL_rone, cp+1, MB); a += incAk; b += incBk; for (k=Kb-1; k; k--, a += incAk, b += incBk) { mm_fixedKcu(mr, NB, KB, ar0, a+1, lda, b+1, ldb, ATL_rnone, cp, MB); mm_fixedKcu(mr, NB, KB, ai0, a+1, lda, b, ldb, ATL_rone, cp+1, MB); mm_fixedKcu(mr, NB, KB, ATL_rone, a, lda, b, ldb, ATL_rnone, cp, MB); mm_fixedKcu(mr, NB, KB, ai1, a, lda, b+1, ldb, ATL_rone, cp+1, MB); } if (kr) { mmcu(mr, NB, kr, ar0, a+1, lda, b+1, ldb, ATL_rnone, cp, MB); mmcu(mr, NB, kr, ai0, a+1, lda, b, ldb, ATL_rone, cp+1, MB); mmcu(mr, NB, kr, ATL_rone, a, lda, b, ldb, ATL_rnone, cp, MB); mmcu(mr, NB, kr, ai1, a, lda, b+1, ldb, ATL_rone, cp+1, MB); } } else if (kr) { mmcu(mr, NB, kr, ar0, a+1, lda, b+1, ldb, ATL_rzero, cp, MB); mmcu(mr, NB, kr, ai0, a+1, lda, b, ldb, ATL_rzero, cp+1, MB); mmcu(mr, NB, kr, ATL_rone, a, lda, b, ldb, ATL_rnone, cp, MB); mmcu(mr, NB, kr, ai1, a, lda, b+1, ldb, ATL_rone, cp+1, MB); } geadd(mr, NB, alpha, cp, MB, beta, c, ldc); } } if (nr) Mjoin(PATL,NCmmJIK)(TA, TB, M, nr, K, alpha, A, lda, B+Nb*(incBn+Kb*incBk), ldb, beta, C+Nb*(NB<<1)*ldc, ldc); free(vp); return(0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -