📄 atl_cmmjitcp.c

📁 基于 BLAS、CLAPACK 的。用过的人知道是干啥的
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2007 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include "atlas_misc.h"
#include "atlas_level3.h"
/*
 * Pull in the matmul header selected by SCPLX.
 * NOTE(review): presumably smm.h/dmm.h are the single/double precision real
 * kernel descriptions (MB, NB, KB, ...) -- confirm against the ATLAS build.
 */
#ifdef SCPLX
   #include "smm.h"
#else
   #include "dmm.h"
#endif
#ifndef ATL_MaxMalloc   /* temp, defined in atlas_lvl3.h */
   #define ATL_MaxMalloc 16777216
#endif

/*
 * Pointer to a copy routine with the same parameter list as the
 * ATL_gecplx2real* helpers below:
 *    (M, N, alpha, A, lda, pR, ldr, pI, ldi)
 */
typedef void (*MAT2BLK3)(const int, const int, const SCALAR, const TYPE*,
                         const int, TYPE*, const int, TYPE*, const int);

/*
 * Prototypes of the matmul kernels that mmK selects between.  They are only
 * declared here; their definitions are not part of this listing.
 * NOTE(review): the _b0/_b1/_bX suffixes presumably denote beta == 0,
 * beta == 1 and arbitrary beta variants -- confirm.
 */
void Mjoin(PATLU,pNBmm_bX)(const int M, const int N, const int K,
                          const TYPE alpha, const TYPE *A, const int lda,
                          const TYPE *B, const int ldb, const TYPE beta,
                          TYPE *C, const int ldc);
void Mjoin(PATLU,pMBmm_bX)(const int M, const int N, const int K,
                          const TYPE alpha, const TYPE *A, const int lda,
                          const TYPE *B, const int ldb, const TYPE beta,
                          TYPE *C, const int ldc);
void Mjoin(PATLU,pKBmm_bX)(const int M, const int N, const int K,
                          const TYPE alpha, const TYPE *A, const int lda,
                          const TYPE *B, const int ldb, const TYPE beta,
                          TYPE *C, const int ldc);
void NBmm_bX(const int M, const int N, const int K,
             const TYPE alpha, const TYPE *A, const int lda,
             const TYPE *B, const int ldb, const TYPE beta,
             TYPE *C, const int ldc);
void Mjoin(PATLU,pNBmm_b1)(const int M, const int N, const int K,
                          const TYPE alpha, const TYPE *A, const int lda,
                          const TYPE *B, const int ldb, const TYPE beta,
                          TYPE *C, const int ldc);
void Mjoin(PATLU,pMBmm_b1)(const int M, const int N, const int K,
                          const TYPE alpha, const TYPE *A, const int lda,
                          const TYPE *B, const int ldb, const TYPE beta,
                          TYPE *C, const int ldc);
void Mjoin(PATLU,pKBmm_b1)(const int M, const int N, const int K,
                          const TYPE alpha, const TYPE *A, const int lda,
                          const TYPE *B, const int ldb, const TYPE beta,
                          TYPE *C, const int ldc);
void NBmm_b1(const int M, const int N, const int K,
             const TYPE alpha, const TYPE *A, const int lda,
             const TYPE *B, const int ldb, const TYPE beta,
             TYPE *C, const int ldc);
void Mjoin(PATLU,pNBmm_b0)(const int M, const int N, const int K,
                          const TYPE alpha, const TYPE *A, const int lda,
                          const TYPE *B, const int ldb, const TYPE beta,
                          TYPE *C, const int ldc);
void Mjoin(PATLU,pMBmm_b0)(const int M, const int N, const int K,
                          const TYPE alpha, const TYPE *A, const int lda,
                          const TYPE *B, const int ldb, const TYPE beta,
                          TYPE *C, const int ldc);
void Mjoin(PATLU,pKBmm_b0)(const int M, const int N, const int K,
                          const TYPE alpha, const TYPE *A, const int lda,
                          const TYPE *B, const int ldb, const TYPE beta,
                          TYPE *C, const int ldc);
void NBmm_b0(const int M, const int N, const int K,
             const TYPE alpha, const TYPE *A, const int lda,
             const TYPE *B, const int ldb, const TYPE beta,
             TYPE *C, const int ldc);
void Mjoin(PATLU,pKBmm)(const int M, const int N, const int K,
                        const TYPE alpha, const TYPE *A, const int lda,
                        const TYPE *B, const int ldb, const TYPE beta,
                        TYPE *C, const int ldc);
void Mjoin(PATL,row2blkT_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR);
void Mjoin(PATL,col2blk_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR);
void Mjoin(PATL,gereal2cplx)
   (const int M, const int N, const TYPE *alpha, const TYPE *R, const int ldr,
    const TYPE *I, const int ldi, const TYPE *beta, TYPE *C, const int ldc);

static void ATL_gecplx2real_a1
   (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda,
    TYPE *pR, const int ldr, TYPE *pI, const int ldi)
/*
 * Splits real & imag components of A into separate real arrays R/I.
 *
 * A is read as interleaved (real, imag) pairs: element (i,j) occupies
 * A[2*(j*lda+i)] and A[2*(j*lda+i)+1].  The first value is stored to
 * pR[j*ldr+i] and the second to pI[j*ldi+i], for 0 <= i < M, 0 <= j < N.
 * alpha is never read by this routine; the parameter list matches the
 * MAT2BLK3 typedef.
 */
{
   /* distance, in TYPE elements, from the start of one column to the end */
   /* of the previous one: the inner loop has already stepped back 2*M    */
   const int incA = (lda-M)<<1;
   int i, j;
/*
 * Copy backwards so 1st part of matrix is LRU
 * (start at the last element of the last column and walk towards A[0])
 */
   A += ((N-1)*lda+M-1)<<1;
   pR += (N-1)*ldr;
   pI += (N-1)*ldi;
   for (j=N; j; j--, A -= incA, pR -= ldr, pI -= ldi)
   {
      for (i=M-1; i >= 0; i--, A -= 2)
      {
         pR[i] = *A;      /* real part */
         pI[i] = A[1];    /* imaginary part */
      }
   }
}

static void ATL_gecplx2realT_a1
   (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda,
    TYPE *pR, const int ldr, TYPE *pI, const int ldi)
/*
 * Splits real & imag components of A' into separate real arrays R/I.
 * Output matrix is MxN, so A must be NxM
 *
 * Element (j,i) of A, read as the interleaved pair at A[2*(i*lda+j)], is
 * written to pR[j*ldr+i] / pI[j*ldi+i], for 0 <= i < M, 0 <= j < N.
 * alpha is never read by this routine.
 */
{
   /* lda2: skip from the end of the N entries read in one column of A to  */
   /* the next column; incR/incI: undo the N row-strides taken by the      */
   /* inner loop and advance one element in R/I                            */
   const int lda2 = (lda-N)<<1, incR = 1-N*ldr, incI = 1-N*ldi;
   int i, j;
/*
 * Loop over M cols of A
 */
   for (i=M; i; i--, A += lda2, pR += incR, pI += incI)
   {
      /* walk down one column of A while stepping by ldr/ldi in R/I */
      for (j=N; j; j--, A += 2, pR += ldr, pI += ldi)
      {
         *pR = *A;
         *pI = A[1];
      }
   }
}

static void ATL_gecplx2realConj_a1
   (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda,
    TYPE *pR, const int ldr, TYPE *pI, const int ldi)
/*
 * Splits real & imag components of the conjugate of A into separate real
 * arrays R/I.
 *
 * Same traversal and indexing as ATL_gecplx2real_a1, except that the
 * imaginary part is negated before it is stored to pI.
 * alpha is never read by this routine.
 */
{
   const int incA = (lda-M)<<1;
   int i, j;
/*
 * Copy backwards so 1st part of matrix is LRU
 */
   A += ((N-1)*lda+M-1)<<1;
   pR += (N-1)*ldr;
   pI += (N-1)*ldi;
   for (j=N; j; j--, A -= incA, pR -= ldr, pI -= ldi)
   {
      for (i=M-1; i >= 0; i--, A -= 2)
      {
         pR[i] = *A;
         pI[i] = -A[1];   /* conjugate: flip sign of imaginary part */
      }
   }
}

static void ATL_gecplx2realC_a1
   (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda,
    TYPE *pR, const int ldr, TYPE *pI, const int ldi)
/*
 * Splits real & imag components of the conjugate of A' into separate real
 * arrays R/I.
 * Output matrix is MxN, so A must be NxM
 *
 * Same traversal and indexing as ATL_gecplx2realT_a1, except that the
 * imaginary part is negated before it is stored to pI.
 * alpha is never read by this routine.
 */
{
   const int lda2 = (lda-N)<<1, incR = 1-N*ldr, incI = 1-N*ldi;
   int i, j;
/*
 * Loop over M cols of A
 */
   for (i=M; i; i--, A += lda2, pR += incR, pI += incI)
   {
      for (j=N; j; j--, A += 2, pR += ldr, pI += ldi)
      {
         *pR = *A;
         *pI = -A[1];     /* conjugate: flip sign of imaginary part */
      }
   }
}

static void Mjoin(PATL,mmK)
   (int M,  /* true # of rows in row-panel, M <= MB */
    int N,  /* true # of cols in col-panel, N <= NB */
    int nblk, /* # of blocks in K dimension */
    int KR,   /* KR = K - nKb*KB; */
    const TYPE *A, /* array to copy from, NULL if already copied */
    const int lda,  /* leading dimension of A */
    const int incA, /* inc to next blk in A */
    const TYPE *alpha,
    TYPE *pA,       /* workspace to copy A to */
    const int incAW, /* 0 : keep using same KBxMB space */
    const TYPE *B, /* array to copy from, NULL if already copied */
    const int ldb,  /* leading dimension of B */
    const int incB, /* inc to next blk in B */
    TYPE *pB,       /* workspace to copy B to */
    const int incBW, /* 0 : keep using same KBxNB space */
    const TYPE *beta,
    TYPE *C,         /* output matrix */
    const int ldc,
    TYPE *pC,       /* ldpc x NB workspace */
    const int ldpc,
    MAT2BLK3 A2blk, /* routine to copy A */
    MAT2BLK3 B2blk) /* routine to copy B */
/*
 * Performs a K-inner-loop matmul, while copying A & B if necessary.
 * If m > M, we are doing extra flops so we don't call cleanup (same for N)
 * This just-in-time copy is better alg when K dim dominates M & N.
 *
 * Each operand workspace is used as two halves: pA / pA+ipa, pB / pB+ipb and
 * pC / pC+ipc.  A2blk and B2blk are handed the "+ip" half in the pR position
 * and the base half in the pI position of MAT2BLK3.
 * NOTE(review): if the copy routines follow the ATL_gecplx2real* argument
 * order, the real parts therefore live at the "+ip" half and the imaginary
 * parts at the base -- confirm against the callers, which are not visible.
 *
 * alpha, beta, C, ldc, kr and zero are not referenced in the part of the
 * routine visible in this listing.
 */
{
   int m, n, kr;  /* # of row/cols to operate on, m >= M, n >= N */
   int k;
   const TYPE one[2] = {ATL_rone, ATL_rzero}, zero[2] = {ATL_rzero, ATL_rzero};
/*
 * Indexes to next blk (i.e. real to imag) of matrices; always uses full
 * block stride, even when we have a partial block
 */
   int ipb = NB*KB, ipa = MB*KB, ipc = ldpc*NB;
   /* kernel pointers; each is called below with ATL_rzero (NBmm0),        */
   /* ATL_rone (NBmm1) or ATL_rnone (NBmmX) in the beta position           */
   void (*NBmm0)(const int, const int, const int, const TYPE,
                 const TYPE*, const int, const TYPE*, const int,
                 const TYPE, TYPE*, const int);
   void (*NBmm1)(const int, const int, const int, const TYPE,
                 const TYPE*, const int, const TYPE*, const int,
                 const TYPE, TYPE*, const int);
   void (*NBmmX)(const int, const int, const int, const TYPE,
                 const TYPE*, const int, const TYPE*, const int,
                 const TYPE, TYPE*, const int);

   /* round M (N) up to a full block when it is within ATL_mmMU (ATL_mmNU) */
   /* of MB (NB); otherwise operate on the true size                       */
   m = (M < MB && M+ATL_mmMU >= MB) ? MB : M;
   n = (N < NB && N+ATL_mmNU >= NB) ? NB : N;
   if (m == MB && n == NB)
   {
      NBmm0 = NBmm_b0;
      NBmm1 = NBmm_b1;
      NBmmX = NBmm_bX;
   }
   else if (m == MB)  /* N cleanup needed */
   {
      NBmm0 = Mjoin(PATLU,pNBmm_b0);
      NBmm1 = Mjoin(PATLU,pNBmm_b1);
      NBmmX = Mjoin(PATLU,pNBmm_bX);
   }
   else if (n == NB) /* M cleanup needed */
   {
      NBmm0 = Mjoin(PATLU,pMBmm_b0);
      NBmm1 = Mjoin(PATLU,pMBmm_b1);
      NBmmX = Mjoin(PATLU,pMBmm_bX);
   }
   else  /* two or more dim < NB, requires generated cleanup */
      NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm);
   if (nblk)
   {
/*
 *    First K-block: copy B and A if they were supplied, then issue the four
 *    half-by-half products.  The two NBmm0 calls are the first ones to touch
 *    pC and pC+ipc and pass ATL_rzero as beta.
 */
      if (B)
      {
         if (n > N)
         {
            /* columns N..n-1 of both halves of pB are not written by      */
            /* B2blk; presumably gezero zero-fills them -- confirm         */
            Mjoin(PATLU,gezero)(KB, n-N, pB+KB*N, KB);
            Mjoin(PATLU,gezero)(KB, n-N, pB+ipb+KB*N, KB);
         }
         B2blk(KB, N, one, B, ldb, pB+ipb, KB, pB, KB);
         B += incB;
      }
      if (A)
      {
         if (m > M)
         {
            /* same padding treatment for columns M..m-1 of pA             */
            Mjoin(PATLU,gezero)(KB, m-M, pA+KB*M, KB);
            Mjoin(PATLU,gezero)(KB, m-M, pA+ipa+KB*M, KB);
         }
         A2blk(KB, M, one, A, lda, pA+ipa, KB, pA, KB);
         A += incA;
      }
      NBmm0(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rzero, pC, ldpc);
      NBmm0(m, n, KB, ATL_rone, pA, KB, pB+ipb, KB, ATL_rzero, pC+ipc, ldpc);
      NBmmX(m, n, KB, ATL_rone, pA+ipa, KB, pB+ipb, KB, ATL_rnone, pC, ldpc);
      NBmm1(m, n, KB, ATL_rone, pA+ipa, KB, pB, KB, ATL_rone, pC+ipc, ldpc);
      pA += incAW; pB += incBW;
/*
 *    Remaining nblk-1 K-blocks: same copy step, but pC is now updated twice
 *    with ATL_rnone as beta and pC+ipc twice with ATL_rone.  The order of
 *    the two NBmmX calls matters, since each one is passed beta = -1 on the
 *    result of the previous call.
 */
      for (k=nblk-1; k; k--)
      {
         if (B)
         {
            if (n > N)
            {
               Mjoin(PATLU,gezero)(KB, n-N, pB+KB*N, KB);
               Mjoin(PATLU,gezero)(KB, n-N, pB+ipb+KB*N, KB);
            }
            B2blk(KB, N, one, B, ldb, pB+ipb, KB, pB, KB);
            B += incB;
         }
         if (A)
         {
            if (m > M)
            {
               Mjoin(PATLU,gezero)(KB, m-M, pA+KB*M, KB);
               Mjoin(PATLU,gezero)(KB, m-M, pA+ipa+KB*M, KB);
            }
            A2blk(KB, M, one, A, lda, pA+ipa, KB, pA, KB);
            A += incA;
         }
         NBmmX(m, n, KB, ATL_rone, pA, KB, pB, KB, ATL_rnone, pC, ldpc);
         NBmm1(m, n, KB, ATL_rone, pA, KB, pB+ipb, KB, ATL_rone, pC+ipc, ldpc);
         NBmmX(m, n, KB, ATL_rone, pA+ipa, KB, pB+ipb, KB, ATL_rnone, pC, ldpc);
         NBmm1(m, n, KB, ATL_rone, pA+ipa, KB, pB, KB, ATL_rone, pC+ipc, ldpc);
         pA += incAW; pB += incBW;
      }
   }
   if (KR)  /* need to cleanup K-loop */
   {
      if (KR+4 >= KB)  /* do extra flops to avoid cleanup loop */
         kr = KB;
      else  /* must use K cleanup */
      {
         kr = KR;
         if (m < MB || n < NB) /* use general K cleanup */
         {
            /* fall back to the true sizes for the general kernel */
            n = N; m = M;
            if (!nblk)
            {
               /* no full K-block ran above, so pC / pC+ipc have not been  */
               /* written yet by this routine                              */
               Mjoin(PATLU,gezero)(M, N, pC, ldpc);
               Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc);
            }
            NBmm1 = NBmmX = NBmm0 = Mjoin(PATLU,pKBmm);
         }
         else /* use K-only cleanup */
         {
            NBmm0 = Mjoin(PATLU,pKBmm_b0);
            NBmm1 = Mjoin(PATLU,pKBmm_b1);
            NBmmX = Mjoin(PATLU,pKBmm_bX);
/* NOTE(review): the listing is cut off here (page 1 of 2); the remainder of
 * this routine is not visible and has been left untouched. */
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -