⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 atl_cmmjitcp.c

📁 基于Blas CLapck的.用过的人知道是干啥的
💻 C
📖 第 1 页 / 共 2 页
字号:
         }      }      if (B)      {         if (n > N)         {            Mjoin(PATLU,gezero)(kr, n-N, pB+kr*N, kr);            Mjoin(PATLU,gezero)(kr, n-N, pB+ipb+kr*N, kr);         }         if (kr != KR)         {            Mjoin(PATLU,gezero)(kr-KR, n, pB+KR, kr);            Mjoin(PATLU,gezero)(kr-KR, n, pB+ipb+KR, kr);         }         B2blk(KR, N, one, B, ldb, pB+ipb, kr, pB, kr);      }      if (A)      {         if (m > M)         {            Mjoin(PATLU,gezero)(kr, m-M, pA+kr*M, kr);            Mjoin(PATLU,gezero)(kr, m-M, pA+ipa+kr*M, kr);         }         if (kr != KR)         {            Mjoin(PATLU,gezero)(kr-KR, n, pA+KR, kr);            Mjoin(PATLU,gezero)(kr-KR, n, pA+ipa+KR, kr);         }         A2blk(KR, M, one, A, lda, pA+ipa, kr, pA, kr);      }      if (nblk)      {         NBmmX(m, n, kr, ATL_rone, pA, kr, pB, kr, ATL_rnone, pC, ldpc);         NBmm1(m, n, kr, ATL_rone, pA, kr, pB+ipb, kr, ATL_rone, pC+ipc, ldpc);      }      else      {         NBmm0(m, n, kr, ATL_rone, pA, kr, pB, kr, ATL_rzero, pC, ldpc);         NBmm0(m, n, kr, ATL_rone, pA, kr, pB+ipb, kr, ATL_rzero, pC+ipc, ldpc);      }      NBmmX(m, n, kr, ATL_rone, pA+ipa, kr, pB+ipb, kr, ATL_rnone, pC, ldpc);      NBmm1(m, n, kr, ATL_rone, pA+ipa, kr, pB, kr, ATL_rone, pC+ipc, ldpc);   }   Mjoin(PATL,gereal2cplx)(M, N, alpha, pC, ldpc, pC+ipc, ldpc, beta, C, ldc);}static int mmNMK   (const int M, const int N, const int K, const int cnmblks, const int cnnblks,    const int cnkblks, const int nkblks, const int kr, const TYPE *alpha,    const TYPE *A, const int lda, int incAk, int incAW,    const TYPE *B, const int ldb, int incBk, int incBW,    const TYPE *beta, TYPE *C, const int ldc, MAT2BLK3 A2blk, MAT2BLK3 B2blk){   int incAm, incAn, incBn, incW, incCn, i, j, mb, nb;   void *vp=NULL;   const TYPE *b;   TYPE *pA, *pB, *pC;   incAm = (incAk == KB*2 ? MB*(lda+lda) : 2*MB);   incBn = (incBk == KB*2 ? NB*(ldb+ldb) : 2*NB);   incW = (incAW ? 2*MB*cnkblks*KB : 0);   incCn = NB*(ldc+ldc) - cnmblks*(MB+MB);   incAn = -cnmblks * (incAW ? incW : incAm);   i = incAW ? 2*cnmblks*MB*cnkblks*KB : 2*MB*KB; /* wrk for A */   i += incBW ? 2*cnkblks*KB*NB : 2*NB*KB;   i += 2*MB*NB;   i *= sizeof(TYPE);   if (i <= ATL_MaxMalloc)      vp = malloc(ATL_Cachelen+i);   if (!vp) return(-1);   pC = ATL_AlignPtr(vp);   pB = pC + 2*MB*NB;   pA = pB + (incBW ? 2*cnkblks*KB*NB : 2*NB*KB);   for (j=0; j < N; j += NB, B += incBn)   {      b = B;      nb = N - j;      nb = Mmin(NB, nb);      for (i=0; i < M; i += MB, A += incAm)      {         mb = M - i;         mb = Mmin(MB, mb);         Mjoin(PATL,mmK)(mb, nb, nkblks, kr, A, lda, incAk, alpha, pA, incAW,                         b, ldb, incBk, pB, incBW, beta, C, ldc, pC, MB,                         A2blk, B2blk);         pA += incW;         b = incBW ? NULL : b;  /* reuse col-panel of B if copied */         C += MB+MB;      }      if (incAW)  /* we have copied all of A, just reuse now */      {         A = NULL;         incAm = incAk = 0;         pA += incAn;      }      else         A += incAn;      C += incCn;   }   free(vp);   return(0);}static int mmMNK   (const int M, const int N, const int K, const int cnmblks, const int cnnblks,    const int cnkblks, const int nkblks, const int kr, const TYPE *alpha,    const TYPE *A, const int lda, int incAk, int incAW,    const TYPE *B, const int ldb, int incBk, int incBW,    const TYPE *beta, TYPE *C, const int ldc, MAT2BLK3 A2blk, MAT2BLK3 B2blk){   int incW, incAm, incBn, incBm, incCn, incCm;   int i, j, mb, nb;   void *vp=NULL;   const TYPE *a;   TYPE *pA, *pB, *pC;   incW = (incBW ? 2*NB*cnkblks*KB : 0);   incAm = (incAk == 2*KB ? 2*MB*lda : 2*MB);   incCn = (ldc+ldc)*NB;   incCm = MB+MB - cnnblks*incCn;   incBn = (incBk == 2*KB ? 2*NB*ldb : 2*NB);   incBm = -cnnblks * (incBW ? incW : incBn);   i = incAW ? 2*cnkblks*KB*MB : 2*MB*KB;           /* wrk for A */   i += incBW ? 2*cnnblks*cnkblks*KB*NB : 2*NB*KB;  /* wrk for B */   i += 2*MB*NB;                                    /* wrk for C */   i *= sizeof(TYPE);   if (i <= ATL_MaxMalloc)      vp = malloc(ATL_Cachelen+i);   if (!vp) return(-1);   pC = ATL_AlignPtr(vp);   pA = pC + 2*MB*NB;   pB = pA + (incAW ? 2*MB*cnkblks*KB : 2*MB*KB);   for (i=0; i < M; i += MB, A += incAm)   {      a = A;      mb = M - i;      mb = Mmin(MB, mb);      for (j=0; j < N; j += NB)      {         nb = N - j;         nb = Mmin(NB, nb);         Mjoin(PATL,mmK)(mb, nb, nkblks, kr, a, lda, incAk, alpha, pA, incAW,                         B, ldb, incBk, pB, incBW, beta, C, ldc, pC, MB,                         A2blk, B2blk);         B += incBn;         pB += incW;         a = incAW ? NULL : a;  /* reuse row-panel of A if copied */         C += incCn;      }      if (incBW)  /* we have copied all of B, just reuse now */      {         B = NULL;         incBn = incBk = 0;         pB += incBm;      }      else         B += incBm;      C += incCm;   }   free(vp);   return(0);}int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,                        const int M0, const int N, const int K,                        const SCALAR alpha, const TYPE *A, const int lda,                        const TYPE *B, const int ldb, const SCALAR beta,                        TYPE *C, const int ldc)/* * This routine copies A & B just before using them, which gets better * cache reuse when the copy cost is dominant (K dominates M & N).  Normally, * (lots of reuse in algorithm) the extra cache noise makes this a bad idea. */{   int incAk, incBk, incAW, incBW, incAB, incC;   int i, j, m, n;   const int M = (M0 >= 0) ? M0 : -M0;   const int nkblks = (K/KB), kr = K-nkblks*KB;   const int cnkblks=(K+KB-1)/KB, cnmblks=(M+MB-1)/MB, cnnblks=(N+NB-1)/NB;   MAT2BLK3 A2blk, B2blk;   if (M0 > 0)  /* normally, only copy all of matrix if reuse is possible */   {       incAW = (N > NB) ? KB*MB*2 : 0;       incBW = (M > NB) ? KB*NB*2 : 0;   }   else  /* M0 < 0 flag to use minimal workspace */      incAW = incBW = 0;   if (TA == AtlasNoTrans)   {      incAk = lda*2*KB;      A2blk = ATL_gecplx2realT_a1;   }   else if (TA == AtlasConjTrans)   {      incAk = KB*2;      A2blk = ATL_gecplx2realConj_a1;   }   else   {      incAk = KB*2;      A2blk = ATL_gecplx2real_a1;   }   if (TB == AtlasNoTrans)   {      incBk = KB*2;      B2blk = ATL_gecplx2real_a1;   }   else if (TB == AtlasConjTrans)   {      incBk = ldb*2*KB;      B2blk = ATL_gecplx2realC_a1;   }   else   {      incBk = ldb*2*KB;      B2blk = ATL_gecplx2realT_a1;   }/* * If A isn't copied, or is smaller than B, copy it as inner matrix */   if (M <= N || incAW)   {      if (mmNMK(M, N, K, cnmblks, cnnblks, cnkblks, nkblks, kr, alpha,                A, lda, incAk, incAW, B, ldb, incBk, incBW,                beta, C, ldc, A2blk, B2blk))      {         if (!incAW)            return(-1);         m = cnmblks;         i = cnmblks >> 1;/* *       Keep trying half-size M until we can copy all of A subsection */         j = 0;       /* # of M blocks to do undefined */         while(i > 2)         {            i += m - i - i;  /* # of blks to try at a whack */            if (!mmNMK(i*MB, N, K, i, cnnblks, cnkblks, nkblks, kr, alpha,                       A, lda, incAk, incAW, B, ldb, incBk, incBW,                       beta, C, ldc, A2blk, B2blk))            {               incAB = (TA == AtlasNoTrans) ? 2*MB*i : 2*MB*lda*i;               incC = 2*MB*i;               j = i;               break;            }            m = i;            i >>= 1;         }         if (j)  /* we've handled first J rows of A */         {            for (i=j; i < cnmblks; i += j)            {               A += incAB;               C += incC;               if (i+j < cnmblks)                  m = j * MB;               else               {                  m = M - i*MB;                  j = cnmblks - i;               }               if (mmNMK(m, N, K, j, cnnblks, cnkblks, nkblks, kr, alpha,                         A, lda, incAk, incAW, B, ldb, incBk, incBW,                         beta, C, ldc, A2blk, B2blk))                  if (mmNMK(m, N, K, j, cnnblks, cnkblks, nkblks, kr, alpha,                            A, lda, incAk, 0, B, ldb, incBk, incBW,                            beta, C, ldc, A2blk, B2blk))                     ATL_assert(!mmNMK(m, N, K, j, cnnblks, cnkblks, nkblks, kr,                                       alpha, A, lda, incAk, 0, B, ldb, incBk,                                       0, beta, C, ldc, A2blk, B2blk));            }         }         else  /* just try not copying A at all */            return(mmNMK(M, N, K, cnmblks, cnnblks, cnkblks, nkblks, kr, alpha,                         A, lda, incAk, 0, B, ldb, incBk, incBW,                         beta, C, ldc, A2blk, B2blk));      }   }/* * If A is copied and is larger than B, copy B as inner matrix */   else if (mmMNK(M, N, K, cnmblks, cnnblks, cnkblks, nkblks, kr, alpha,                  A, lda, incAk, incAW, B, ldb, incBk, incBW,                  beta, C, ldc, A2blk, B2blk))   {      if (!incBW)         return(-1);      n = cnnblks;      i = cnnblks >> 1;/* *    Keep trying half-size M until we can copy all of A subsection */      j = 0;       /* # of M blocks to do undefined */      while(i > 2)      {         i += n - i - i;  /* # of blks to try at a whack */         if (!mmMNK(M, i*NB, K, cnmblks, i, cnkblks, nkblks, kr, alpha,                    A, lda, incAk, incAW, B, ldb, incBk, incBW,                    beta, C, ldc, A2blk, B2blk))         {            incAB = (TB == AtlasNoTrans) ? 2*NB*ldb*i : 2*NB*i;            incC = 2*NB*i*ldc;            j = i;            break;         }         n = i;         i >>= 1;      }      if (j)  /* we've handled first J cols of B */      {         for (i=j; i < cnnblks; i += j)         {            B += incAB;            C += incC;            if (i+j < cnnblks)               n = j * MB;            else            {               n = N - i*NB;               j = cnnblks - i;            }            if (mmMNK(M, n, K, cnmblks, j, cnkblks, nkblks, kr, alpha,                      A, lda, incAk, incAW, B, ldb, incBk, incBW,                      beta, C, ldc, A2blk, B2blk))               if (mmMNK(M, n, K, cnmblks, j, cnkblks, nkblks, kr, alpha,                         A, lda, incAk, incAW, B, ldb, incBk, 0,                         beta, C, ldc, A2blk, B2blk))                  ATL_assert(!mmMNK(M, n, K, cnmblks, j, cnkblks, nkblks, kr,                                    alpha, A, lda, incAk, 0, B, ldb, incBk, 0,                                    beta, C, ldc, A2blk, B2blk));         }      }      else  /* just try not copying A at all */         return(mmNMK(M, N, K, cnmblks, cnnblks, cnkblks, nkblks, kr, alpha,                      A, lda, incAk, 0, B, ldb, incBk, incBW,                      beta, C, ldc, A2blk, B2blk));   }   return(0);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -