📄 atl_cmmjitcp.c
字号:
} } if (B) { if (n > N) { Mjoin(PATLU,gezero)(kr, n-N, pB+kr*N, kr); Mjoin(PATLU,gezero)(kr, n-N, pB+ipb+kr*N, kr); } if (kr != KR) { Mjoin(PATLU,gezero)(kr-KR, n, pB+KR, kr); Mjoin(PATLU,gezero)(kr-KR, n, pB+ipb+KR, kr); } B2blk(KR, N, one, B, ldb, pB+ipb, kr, pB, kr); } if (A) { if (m > M) { Mjoin(PATLU,gezero)(kr, m-M, pA+kr*M, kr); Mjoin(PATLU,gezero)(kr, m-M, pA+ipa+kr*M, kr); } if (kr != KR) { Mjoin(PATLU,gezero)(kr-KR, n, pA+KR, kr); Mjoin(PATLU,gezero)(kr-KR, n, pA+ipa+KR, kr); } A2blk(KR, M, one, A, lda, pA+ipa, kr, pA, kr); } if (nblk) { NBmmX(m, n, kr, ATL_rone, pA, kr, pB, kr, ATL_rnone, pC, ldpc); NBmm1(m, n, kr, ATL_rone, pA, kr, pB+ipb, kr, ATL_rone, pC+ipc, ldpc); } else { NBmm0(m, n, kr, ATL_rone, pA, kr, pB, kr, ATL_rzero, pC, ldpc); NBmm0(m, n, kr, ATL_rone, pA, kr, pB+ipb, kr, ATL_rzero, pC+ipc, ldpc); } NBmmX(m, n, kr, ATL_rone, pA+ipa, kr, pB+ipb, kr, ATL_rnone, pC, ldpc); NBmm1(m, n, kr, ATL_rone, pA+ipa, kr, pB, kr, ATL_rone, pC+ipc, ldpc); } Mjoin(PATL,gereal2cplx)(M, N, alpha, pC, ldpc, pC+ipc, ldpc, beta, C, ldc);}static int mmNMK (const int M, const int N, const int K, const int cnmblks, const int cnnblks, const int cnkblks, const int nkblks, const int kr, const TYPE *alpha, const TYPE *A, const int lda, int incAk, int incAW, const TYPE *B, const int ldb, int incBk, int incBW, const TYPE *beta, TYPE *C, const int ldc, MAT2BLK3 A2blk, MAT2BLK3 B2blk){ int incAm, incAn, incBn, incW, incCn, i, j, mb, nb; void *vp=NULL; const TYPE *b; TYPE *pA, *pB, *pC; incAm = (incAk == KB*2 ? MB*(lda+lda) : 2*MB); incBn = (incBk == KB*2 ? NB*(ldb+ldb) : 2*NB); incW = (incAW ? 2*MB*cnkblks*KB : 0); incCn = NB*(ldc+ldc) - cnmblks*(MB+MB); incAn = -cnmblks * (incAW ? incW : incAm); i = incAW ? 2*cnmblks*MB*cnkblks*KB : 2*MB*KB; /* wrk for A */ i += incBW ? 2*cnkblks*KB*NB : 2*NB*KB; i += 2*MB*NB; i *= sizeof(TYPE); if (i <= ATL_MaxMalloc) vp = malloc(ATL_Cachelen+i); if (!vp) return(-1); pC = ATL_AlignPtr(vp); pB = pC + 2*MB*NB; pA = pB + (incBW ? 2*cnkblks*KB*NB : 2*NB*KB); for (j=0; j < N; j += NB, B += incBn) { b = B; nb = N - j; nb = Mmin(NB, nb); for (i=0; i < M; i += MB, A += incAm) { mb = M - i; mb = Mmin(MB, mb); Mjoin(PATL,mmK)(mb, nb, nkblks, kr, A, lda, incAk, alpha, pA, incAW, b, ldb, incBk, pB, incBW, beta, C, ldc, pC, MB, A2blk, B2blk); pA += incW; b = incBW ? NULL : b; /* reuse col-panel of B if copied */ C += MB+MB; } if (incAW) /* we have copied all of A, just reuse now */ { A = NULL; incAm = incAk = 0; pA += incAn; } else A += incAn; C += incCn; } free(vp); return(0);}static int mmMNK (const int M, const int N, const int K, const int cnmblks, const int cnnblks, const int cnkblks, const int nkblks, const int kr, const TYPE *alpha, const TYPE *A, const int lda, int incAk, int incAW, const TYPE *B, const int ldb, int incBk, int incBW, const TYPE *beta, TYPE *C, const int ldc, MAT2BLK3 A2blk, MAT2BLK3 B2blk){ int incW, incAm, incBn, incBm, incCn, incCm; int i, j, mb, nb; void *vp=NULL; const TYPE *a; TYPE *pA, *pB, *pC; incW = (incBW ? 2*NB*cnkblks*KB : 0); incAm = (incAk == 2*KB ? 2*MB*lda : 2*MB); incCn = (ldc+ldc)*NB; incCm = MB+MB - cnnblks*incCn; incBn = (incBk == 2*KB ? 2*NB*ldb : 2*NB); incBm = -cnnblks * (incBW ? incW : incBn); i = incAW ? 2*cnkblks*KB*MB : 2*MB*KB; /* wrk for A */ i += incBW ? 2*cnnblks*cnkblks*KB*NB : 2*NB*KB; /* wrk for B */ i += 2*MB*NB; /* wrk for C */ i *= sizeof(TYPE); if (i <= ATL_MaxMalloc) vp = malloc(ATL_Cachelen+i); if (!vp) return(-1); pC = ATL_AlignPtr(vp); pA = pC + 2*MB*NB; pB = pA + (incAW ? 2*MB*cnkblks*KB : 2*MB*KB); for (i=0; i < M; i += MB, A += incAm) { a = A; mb = M - i; mb = Mmin(MB, mb); for (j=0; j < N; j += NB) { nb = N - j; nb = Mmin(NB, nb); Mjoin(PATL,mmK)(mb, nb, nkblks, kr, a, lda, incAk, alpha, pA, incAW, B, ldb, incBk, pB, incBW, beta, C, ldc, pC, MB, A2blk, B2blk); B += incBn; pB += incW; a = incAW ? NULL : a; /* reuse row-panel of A if copied */ C += incCn; } if (incBW) /* we have copied all of B, just reuse now */ { B = NULL; incBn = incBk = 0; pB += incBm; } else B += incBm; C += incCm; } free(vp); return(0);}int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M0, const int N, const int K, const SCALAR alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const SCALAR beta, TYPE *C, const int ldc)/* * This routine copies A & B just before using them, which gets better * cache reuse when the copy cost is dominant (K dominates M & N). Normally, * (lots of reuse in algorithm) the extra cache noise makes this a bad idea. */{ int incAk, incBk, incAW, incBW, incAB, incC; int i, j, m, n; const int M = (M0 >= 0) ? M0 : -M0; const int nkblks = (K/KB), kr = K-nkblks*KB; const int cnkblks=(K+KB-1)/KB, cnmblks=(M+MB-1)/MB, cnnblks=(N+NB-1)/NB; MAT2BLK3 A2blk, B2blk; if (M0 > 0) /* normally, only copy all of matrix if reuse is possible */ { incAW = (N > NB) ? KB*MB*2 : 0; incBW = (M > NB) ? KB*NB*2 : 0; } else /* M0 < 0 flag to use minimal workspace */ incAW = incBW = 0; if (TA == AtlasNoTrans) { incAk = lda*2*KB; A2blk = ATL_gecplx2realT_a1; } else if (TA == AtlasConjTrans) { incAk = KB*2; A2blk = ATL_gecplx2realConj_a1; } else { incAk = KB*2; A2blk = ATL_gecplx2real_a1; } if (TB == AtlasNoTrans) { incBk = KB*2; B2blk = ATL_gecplx2real_a1; } else if (TB == AtlasConjTrans) { incBk = ldb*2*KB; B2blk = ATL_gecplx2realC_a1; } else { incBk = ldb*2*KB; B2blk = ATL_gecplx2realT_a1; }/* * If A isn't copied, or is smaller than B, copy it as inner matrix */ if (M <= N || incAW) { if (mmNMK(M, N, K, cnmblks, cnnblks, cnkblks, nkblks, kr, alpha, A, lda, incAk, incAW, B, ldb, incBk, incBW, beta, C, ldc, A2blk, B2blk)) { if (!incAW) return(-1); m = cnmblks; i = cnmblks >> 1;/* * Keep trying half-size M until we can copy all of A subsection */ j = 0; /* # of M blocks to do undefined */ while(i > 2) { i += m - i - i; /* # of blks to try at a whack */ if (!mmNMK(i*MB, N, K, i, cnnblks, cnkblks, nkblks, kr, alpha, A, lda, incAk, incAW, B, ldb, incBk, incBW, beta, C, ldc, A2blk, B2blk)) { incAB = (TA == AtlasNoTrans) ? 2*MB*i : 2*MB*lda*i; incC = 2*MB*i; j = i; break; } m = i; i >>= 1; } if (j) /* we've handled first J rows of A */ { for (i=j; i < cnmblks; i += j) { A += incAB; C += incC; if (i+j < cnmblks) m = j * MB; else { m = M - i*MB; j = cnmblks - i; } if (mmNMK(m, N, K, j, cnnblks, cnkblks, nkblks, kr, alpha, A, lda, incAk, incAW, B, ldb, incBk, incBW, beta, C, ldc, A2blk, B2blk)) if (mmNMK(m, N, K, j, cnnblks, cnkblks, nkblks, kr, alpha, A, lda, incAk, 0, B, ldb, incBk, incBW, beta, C, ldc, A2blk, B2blk)) ATL_assert(!mmNMK(m, N, K, j, cnnblks, cnkblks, nkblks, kr, alpha, A, lda, incAk, 0, B, ldb, incBk, 0, beta, C, ldc, A2blk, B2blk)); } } else /* just try not copying A at all */ return(mmNMK(M, N, K, cnmblks, cnnblks, cnkblks, nkblks, kr, alpha, A, lda, incAk, 0, B, ldb, incBk, incBW, beta, C, ldc, A2blk, B2blk)); } }/* * If A is copied and is larger than B, copy B as inner matrix */ else if (mmMNK(M, N, K, cnmblks, cnnblks, cnkblks, nkblks, kr, alpha, A, lda, incAk, incAW, B, ldb, incBk, incBW, beta, C, ldc, A2blk, B2blk)) { if (!incBW) return(-1); n = cnnblks; i = cnnblks >> 1;/* * Keep trying half-size M until we can copy all of A subsection */ j = 0; /* # of M blocks to do undefined */ while(i > 2) { i += n - i - i; /* # of blks to try at a whack */ if (!mmMNK(M, i*NB, K, cnmblks, i, cnkblks, nkblks, kr, alpha, A, lda, incAk, incAW, B, ldb, incBk, incBW, beta, C, ldc, A2blk, B2blk)) { incAB = (TB == AtlasNoTrans) ? 2*NB*ldb*i : 2*NB*i; incC = 2*NB*i*ldc; j = i; break; } n = i; i >>= 1; } if (j) /* we've handled first J cols of B */ { for (i=j; i < cnnblks; i += j) { B += incAB; C += incC; if (i+j < cnnblks) n = j * MB; else { n = N - i*NB; j = cnnblks - i; } if (mmMNK(M, n, K, cnmblks, j, cnkblks, nkblks, kr, alpha, A, lda, incAk, incAW, B, ldb, incBk, incBW, beta, C, ldc, A2blk, B2blk)) if (mmMNK(M, n, K, cnmblks, j, cnkblks, nkblks, kr, alpha, A, lda, incAk, incAW, B, ldb, incBk, 0, beta, C, ldc, A2blk, B2blk)) ATL_assert(!mmMNK(M, n, K, cnmblks, j, cnkblks, nkblks, kr, alpha, A, lda, incAk, 0, B, ldb, incBk, 0, beta, C, ldc, A2blk, B2blk)); } } else /* just try not copying A at all */ return(mmNMK(M, N, K, cnmblks, cnnblks, cnkblks, nkblks, kr, alpha, A, lda, incAk, 0, B, ldb, incBk, incBW, beta, C, ldc, A2blk, B2blk)); } return(0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -