⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 atl_mmjki.c

📁 基于Blas CLapck的.用过的人知道是干啥的
💻 C
字号:
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2006 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_misc.h"#include "atlas_lvl3.h"#include <stdlib.h>void Mjoin(PATL,mm_axpy)   (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, const int M,    const int N, const int K, const SCALAR alpha, const TYPE *A, const int lda,    const TYPE *B, const int ldb, const SCALAR beta, TYPE *C, const int ldc)/* * TA == AtlasNoTrans * GEMM implemented by calling axpy, with any M partitioning already done */{   int i, j, k, incBk, incBn;   const int incAn = lda*K;   if (TB == AtlasNoTrans)   {      incBk = 1;      incBn = ldb - K;   }   else   {      incBk = ldb;      incBn = 1 - ldb*K;   }   if (beta == ATL_rone)   {      if (alpha == ATL_rone)      {         for(j=0; j < N; j++)         {            for (k=0; k < K; k++, B += incBk, A += lda)               Mjoin(PATL,axpy)(M, *B, A, 1, C, 1);            C += ldc;            B += incBn;            A -= incAn;         }      }      else      {         for(j=0; j < N; j++)         {            Mjoin(PATL,axpby)(M, alpha*(*B), A, 1, beta, C, 1);            B += incBk;            A += lda;            for (k=1; k < K; k++, B += incBk, A += lda)               Mjoin(PATL,axpy)(M, alpha*(*B), A, 1, C, 1);            C += ldc;            B += incBn;            A -= incAn;         }      }   }   else /* BETA != 1.0 */   {      if (alpha == ATL_rone)      {         for(j=0; j < N; j++)         {            Mjoin(PATL,axpby)(M, *B, A, 1, beta, C, 1);            B += incBk;            A += lda;            for (k=1; k < K; k++, B += incBk, A += lda)               Mjoin(PATL,axpy)(M, *B, A, 1, C, 1);            C += ldc;            B += incBn;            A -= incAn;         }      }      else      {         for(j=0; j < N; j++)         {            Mjoin(PATL,axpby)(M, alpha*(*B), A, 1, beta, C, 1);            B += incBk;            A += lda;            for (k=1; k < K; k++, B += incBk, A += lda)               Mjoin(PATL,axpy)(M, alpha*(*B), A, 1, C, 1);            C += ldc;            B += incBn;            A -= incAn;         }      }   }}int Mjoin(PATL,mmJKI)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,                      const int M, const int N, const int K,                      const SCALAR alpha, const TYPE *A, const int lda,                      const TYPE *B, const int ldb, const SCALAR beta,                      TYPE *C, const int ldc)/* * This gemm is for small K, so we build gemm out of AXPY (outer product) * rather than dot (inner product). */{   int Mp, mp, m, k, ldaa=lda;   void *vA=NULL;   TYPE *pA;   SCALAR alp=alpha;/* * Compute M partition necessary to promote reuse in the L1 cache.  Check * NB^2 in addition to L1elts, to catch machines where L1 is not used by FPU. * If this gives a small Mp, use CacheEdge instead (reuse in L2 instead of L1). */   Mp = NB*NB;   if (ATL_L1elts > Mp) Mp = ATL_L1elts;   Mp /= (K+2);   if (Mp < 256)   {      #if !defined(CacheEdge) || CacheEdge == 0         Mp = M;      #else         Mp = (CacheEdge) / ((K+2)*ATL_sizeof);         if (Mp < 256)            Mp = M;      #endif   }   if (Mp > M)      Mp = M;/* * Change Mp if remainder is very small */   else   {      Mp -= 16;      /* small safety margin on filling cache */      mp = M / Mp;      m = M - mp*Mp;      if (m && m < 32)         Mp += (m+mp-1)/mp;   }/* * If A not in NoTrans format, need to copy so it can use axpy wt stride=1. * NOTE: this routine should not be called when you can't afford this copy */   if (TA != AtlasNoTrans)   {      vA = malloc(ATL_Cachelen + Mp*ATL_MulBySize(K));      if (!vA) return(-1);      pA = ATL_AlignPtr(vA);      alp = ATL_rone;      ldaa = Mp;      pA += Mp;   }   else      pA = (TYPE *) A;   for (m=0; m < M; m += Mp)   {      mp = M - m;      if (mp > Mp)         mp = Mp;/* *    If the thing is in Trans format, copy to NoTrans */      if (vA)      {         pA -= Mp;         for (k=0; k < K; k++)            Mjoin(PATL,cpsc)(mp, alpha, A+k, lda, pA+k*ldaa, 1);         A += mp*lda;      }      Mjoin(PATL,mm_axpy)(AtlasNoTrans, TB, mp, N, K, alp, pA, ldaa, B, ldb,                          beta, C, ldc);      pA += mp;      C += mp;   }   if (vA) free(vA);   return(0);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -