⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 atl_mm4x4x2us_mn.c

📁 基于Blas CLapck的.用过的人知道是干啥的
💻 C
📖 第 1 页 / 共 2 页
字号:
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2000 R. Clint Whaley * * Code contributers : R. Clint Whaley, Viet Nguyen and Peter Strazdins * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * *//* * NOTE: This is a direct adaption of Viet Nguyen's and Peter Strazdin's *       ATL_mm4x4x2US.c code for fixed NB.  I'm not sure who should hold *       the copyright in such a case; essentially, I did the typing, but it *       completely uses their design for the inner kernel.  This file handles *       M or N loop cleanup of arbitrary dimension. */#include <atlas_misc.h>#if !defined(KB) || KB == 0 || (KB/2)*2 != KB   #error KB must be nonzero multiple of 2#endif#if !defined(NB) || NB == 0   #define CLEANING_N#elif ( (NB/4)*4 != NB )   #error NB must be multiple of 4#endif#if !defined(MB) || MB == 0   #define CLEANING_M#elif ( (MB/4)*4 != MB )   #error MB must be multiple of 4#endif#if defined(CLEANING_M) && defined(CLEANING_N)   #error One of MB and NB must be defined#endif#if defined(CLEANING_M) || defined(CLEANING_N)static void ATL_mm1x1x1   (const int M, const int N, const int K, const TYPE alpha,    const TYPE *A, const int lda, const TYPE *B, const int ldb,    const TYPE beta, TYPE *C, const int ldc)/* * matmul with TA=T, TB=N, MB=0, NB=0, KB=0, * lda=0, ldb=0, ldc=0, mu=1, nu=1, ku=1 */{   #define Mb M   #define Nb N   #define Kb K   const TYPE *stM = A + (lda*Mb);   const TYPE *stN = B + (ldb*Nb);   const int incAm = ((lda) - Kb), incAn = -(Mb*lda);   const int incBm = -(Kb), incBn = (ldb);   #ifdef TREAL      #define incCm 1      const int incCn = (ldc) - (Mb);   #else      #define incCm 2      const int incCn = (ldc - Mb)<<1;   #endif   TYPE *pC0=C;   const TYPE *pA0=A;   const TYPE *pB0=B;   register int k;   register TYPE rA0;   register TYPE rB0;   register TYPE rC0_0;   do /* N-loop */   {      do /* M-loop */      {         #ifdef BETA0            rC0_0 = ATL_rzero;         #elif defined(BETA1)            rC0_0 = *pC0;         #else            rC0_0 = *pC0 * beta;         #endif         for (k=K; k; k--) /* easy loop to unroll */         {            rA0 = *pA0++;            rB0 = *pB0++;            rC0_0 += rA0 * rB0;         }         *pC0 = rC0_0;         pC0 += incCm;         pA0 += incAm;         pB0 += incBm;      }      while(pA0 != stM);      pC0 += incCn;      pA0 += incAn;      pB0 += incBn;   }   while(pB0 != stN);}#ifdef incCm   #undef incCm#endif#ifdef Mb   #undef Mb#endif#ifdef Nb   #undef Nb#endif#ifdef Kb   #undef Kb#endif#endif#ifdef CLEANING_Mstatic void ATL_mm1x4x1   (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)/* * matmul with TA=T, TB=N, MB=0, NB=0, KB=0, * lda=0, ldb=0, ldc=0, mu=1, nu=4, ku=1 */{   #define Mb M   const int Nb = (N>>2)<<2;   #define Kb K   const int Kloop = K - 2;   const TYPE *ca=A, *cb=B;   TYPE *cc=C;   const TYPE *stM = A + (lda*Mb);   const TYPE *stN = B + (ldb*Nb);   #define incAk 1   const int incAm = ((lda) - Kb), incAn = -(Mb*lda);   #define incBk 1   const int incBm = -(Kb), incBn = (((ldb) << 2));   #define incAk0 incAk   #define incBk0 incBk   #ifdef TREAL      #define incCm 1      #define ldc2 ldc      const int incCn = (((ldc) << 2)) - (Mb);   #else      #define incCm 2      const int incCn = ((((ldc) << 2)) - (Mb))<<1, ldc2=ldc<<1;   #endif   TYPE *pC0=C, *pC1=pC0+(ldc2), *pC2=pC1+(ldc2), *pC3=pC2+(ldc2);   const TYPE *pA0=A;   const TYPE *pB0=B, *pB1=pB0+(ldb), *pB2=pB1+(ldb), *pB3=pB2+(ldb);   register int k;   register TYPE rA0;   register TYPE rB0, rB1, rB2, rB3;   register TYPE m0, m1, m2, m3;   register TYPE rC0_0, rC0_1, rC0_2, rC0_3;   if (K < 3)   {      ATL_mm1x1x1(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);      return;   }   if (pB0 != stN)   {      do /* N-loop */      {         do /* M-loop */         {            #ifdef BETA0               rC0_0 = rC0_1 = rC0_2 = rC0_3 = ATL_rzero;            #else               rC0_0 = *pC0; rC0_1 = *pC1; rC0_2 = *pC2; rC0_3 = *pC3;               #ifdef BETAX                  rB3 = beta;                  rC0_0 *= rB3; rC0_1 *= rB3; rC0_2 *= rB3; rC0_3 *= rB3;               #endif            #endif/* *          Start pipeline */            rA0 = *pA0;            rB0 = *pB0;            rB1 = *pB1;            rB2 = *pB2;            rB3 = *pB3;            m0 = rA0 * rB0;            m1 = rA0 * rB1;            m2 = rA0 * rB2;            m3 = rA0 * rB3;            rA0 = pA0[1];            rB0 = pB0[1];            rB1 = pB1[1];            rB2 = pB2[1];            rB3 = pB3[1];            pA0 += (incAk0);            pB0 += (incBk0);            pB1 += (incBk0);            pB2 += (incBk0);            pB3 += (incBk0);            for (k=Kloop; k; k--) /* easy loop to unroll */            {               rC0_0 += m0;               m0 = rA0 * rB0;               rC0_1 += m1;               m1 = rA0 * rB1;               rC0_2 += m2;               m2 = rA0 * rB2;               rC0_3 += m3;               m3 = rA0 * rB3;               rA0 = pA0[1];               rB0 = pB0[1];               rB1 = pB1[1];               rB2 = pB2[1];               rB3 = pB3[1];               pA0 += incAk;               pB0 += incBk;               pB1 += incBk;               pB2 += incBk;               pB3 += incBk;            }/* *          Drain pipe on last iteration of K-loop */            rC0_0 += m0;            m0 = rA0 * rB0;            rC0_1 += m1;            m1 = rA0 * rB1;            rC0_2 += m2;            m2 = rA0 * rB2;            rC0_3 += m3;            m3 = rA0 * rB3;            rC0_0 += m0;            rC0_1 += m1;            rC0_2 += m2;            rC0_3 += m3;            pA0 += incAk0;            pB0 += incBk0;            pB1 += incBk0;            pB2 += incBk0;            pB3 += incBk0;            *pC0 = rC0_0;            *pC1 = rC0_1;            *pC2 = rC0_2;            *pC3 = rC0_3;            pC0 += incCm;            pC1 += incCm;            pC2 += incCm;            pC3 += incCm;            pA0 += incAm;            pB0 += incBm;            pB1 += incBm;            pB2 += incBm;            pB3 += incBm;         }         while(pA0 != stM);         pC0 += incCn;         pC1 += incCn;         pC2 += incCn;         pC3 += incCn;         pA0 += incAn;         pB0 += incBn;         pB1 += incBn;         pB2 += incBn;         pB3 += incBn;      }      while(pB0 != stN);   }   if (k=N-Nb)      ATL_mm1x1x1(M, k, K, alpha, ca, lda, cb + (Nb*ldb), ldb, beta,                  cc + (Nb*ldc2), ldc);}#ifdef ldc2   #undef ldc2#endif#ifdef incAm   #undef incAm#endif#ifdef incAn   #undef incAn#endif#ifdef incAk   #undef incAk#endif#ifdef incBm   #undef incBm#endif#ifdef incBn   #undef incBn#endif#ifdef incBk   #undef incBk#endif#ifdef incCm   #undef incCm#endif#ifdef incCn   #undef incCn#endif#ifdef incCk   #undef incCk#endif#ifdef Mb   #undef Mb#endif#ifdef Nb   #undef Nb#endif#ifdef Kb   #undef Kb#endif#endif#ifdef CLEANING_Nstatic void ATL_mm4x1x1   (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)/* * matmul with TA=T, TB=N, MB=0, NB=0, KB=0, * lda=0, ldb=0, ldc=0, mu=4, nu=1, ku=1 */{   const int Mb = (M>>2)<<2;   #define Nb N   #define Kb K   const int Kloop = K - 2;   const TYPE *ca=A, *cb=B;   TYPE *cc=C;   const TYPE *stM = A + (lda*Mb);   const TYPE *stN = B + (ldb*Nb);   #define incAk 1   const int incAm = ((((lda) << 2)) - Kb), incAn = -(Mb*lda);   #define incBk 1   const int incBm = -(Kb), incBn = (ldb);   #define incAk0 incAk   #define incBk0 incBk   #ifdef TREAL      #define incCm 4      const int incCn = (ldc) - (Mb);   #else      #define incCm 8      const int incCn = (ldc - Mb)<<1;   #endif   TYPE *pC0=C;   const TYPE *pA0=A, *pA1=pA0+(lda), *pA2=pA1+(lda), *pA3=pA2+(lda);   const TYPE *pB0=B;   register int k;   register TYPE rA0, rA1, rA2, rA3;   register TYPE rB0;   register TYPE m0, m1, m2, m3;   register TYPE rC0_0, rC1_0, rC2_0, rC3_0;   if (K < 3)   {      ATL_mm1x1x1(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);      return;   }   if (pA0 != stM)   {      do /* N-loop */      {         do /* M-loop */         {            #ifdef BETA0               rC0_0 = rC1_0 = rC2_0 = rC3_0 = ATL_rzero;            #else               #ifdef TREAL                  rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3];               #else                  rC0_0 = *pC0; rC1_0 = pC0[2]; rC2_0 = pC0[4]; rC3_0 = pC0[6];               #endif               #ifdef BETAX                  rA3 = beta;                  rC0_0 *= rA3; rC1_0 *= rA3; rC2_0 *= rA3; rC3_0 *= rA3;               #endif            #endif/* *          Start pipeline

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -