atl_smmmncu_av.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 484 行 · 第 1/2 页

C
484
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2001 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */
#include "atlas_misc.h"
/*
 * Both macros are defined before atlas_prefetch.h is included.
 * NOTE(review): presumably they select the AltiVec prefetch macros
 * (ATL_pfavR, ATL_pfavW, ATL_GetCtrl) used below -- confirm in that header.
 */
#define ATL_NoFakePF
#define ATL_AltiVec
#include "atlas_prefetch.h"
/*
 * KB must be supplied at compile time and be a nonzero multiple of 16;
 * anything else stops the build.
 */
#ifndef KB
   #error KB must be compile-time constant!
#elif (KB / 16)*16 != KB || KB == 0
   #error KB must be multiple of 16!
#endif
/*
 * MB and NB are optional.  When MB is undefined, zero, or not a multiple of 4,
 * ATL_MNCLEAN and ATL_MCLEAN are defined; the same test on NB defines
 * ATL_MNCLEAN and ATL_NCLEAN.
 */
#ifndef MB
   #define ATL_MNCLEAN
   #define ATL_MCLEAN
#elif (MB/4)*4 != MB || MB == 0
   #define ATL_MNCLEAN
   #define ATL_MCLEAN
#endif
#ifndef NB
   #define ATL_MNCLEAN
   #define ATL_NCLEAN
#elif (NB/4)*4 != NB || NB == 0
   #define ATL_MNCLEAN
   #define ATL_NCLEAN
#endif
/*
 * VecReorder(v0, v1, v2, v3): rearranges four vectors in place using two
 * rounds of vec_mergeh and vec_mergel.  The macro writes vA0..vA3 as scratch,
 * so those names must be declared where it is expanded and must not be passed
 * as arguments (the first assignment would overwrite an input still needed).
 * NOTE(review): going by the AltiVec merge semantics for 4-element vectors,
 * the outcome is v0 = element 0 of every input, v2 = element 1, v1 = element 2
 * and v3 = element 3, i.e. a 4x4 transpose with the two middle outputs
 * swapped -- confirm against the AltiVec reference.
 */
#define VecReorder(v0, v1, v2, v3) \
{ \
   vA0 = vec_mergeh(v0, v2); \
   vA2 = vec_mergel(v0, v2); \
   vA1 = vec_mergeh(v1, v3); \
   vA3 = vec_mergel(v1, v3); \
   v0  = vec_mergeh(vA0, vA1); \
   v2  = vec_mergel(vA0, vA1); \
   v1  = vec_mergeh(vA2, vA3); \
   v3  = vec_mergel(vA2, vA3); \
}
#ifdef ATL_MNCLEAN
static
void ATL_mmcu(const int M, const int N, const int K, const TYPE alpha,
              const TYPE *A, const int lda, const TYPE *B, const int ldb,
              const TYPE beta, TYPE *C, const int ldc, TYPE *tC, int cwrdKB)
/*
 * Braindead cleanup; assumes java mode already turned on by caller
 *
 * Only compiled when ATL_MNCLEAN is defined.  For each j from 0 to N-1 and
 * each i from 0 to M-1 it forms the KB-long dot product of the i-th run of KB
 * floats in A with the j-th run of KB floats in B, one element of C at a time,
 * and combines it with C:
 *    start value : ATL_rzero when BETA0 is defined, otherwise the old *C
 *    stored value: start*beta + dot when BETAX is defined, otherwise
 *                  start + dot
 *
 * M, N    : trip counts of the two outer loops.  Both loops test with !=,
 *           so a negative count would not terminate normally.
 * K, alpha, lda, ldb : not referenced in the body; the inner length is the
 *           compile-time KB.
 * A       : read as M consecutive runs of KB floats, restarted for every j.
 * B       : read as N consecutive runs of KB floats.
 * beta    : used only when BETAX is defined.
 * C, ldc  : C advances by one TYPE per i (two when TCPLX is defined) and by
 *           (ldc-M)SHIFT after each j.
 * tC      : scratch that receives the 4-lane accumulator through vec_st;
 *           tC[0] through tC[3] are read back and summed.
 * cwrdKB  : prefetch control word handed to ATL_pfavR and ATL_pfavW; it is
 *           decoded below as size (bits 24 and up), count (bits 16 to 23)
 *           and stride (bits 0 to 15).
 *
 * NOTE(review): A and B are read through const float pointers, so TYPE is
 * presumably float here -- confirm.
 * NOTE(review): vec_ld and vec_st presumably need 16-byte-aligned addresses,
 * so A, B and tC are assumed to be aligned by the caller -- confirm.
 * NOTE(review): ATL_USERMM below issues vec_mtvscr under the comment about
 * java/ieee mode unless ATL_NoIEEE is defined; presumably that is the caller
 * this routine relies on -- the call site is outside this excerpt.
 */
{
   int i, j, k;
   /* SHIFT comes from elsewhere; presumably empty for real TYPE and a
      left shift for complex TYPE -- confirm */
   const int incCn = (ldc-M)SHIFT;
   const float *pA0=A, *pB0=B;
   register float rC0_0;
   vector float vA0, vB0, vC0_0;
   /* every lane of the accumulator starts from -0.0f */
   const vector float nzero =  VECTOR_INIT(-0.0f, -0.0f, -0.0f, -0.0f);

   /* split the incoming control word into its three fields */
   i = cwrdKB>>24; /* size */
   j = (cwrdKB - (i<<24))>>16; /* count */
   k = (cwrdKB - (i<<24) - (j<<16)); /* stride */
   /* rebuild a control word with the count scaled by M and start a read
      prefetch on A; the last argument is presumably the stream tag */
   ATL_pfavR(A, ATL_GetCtrl(k, j*M, i), 3);
   for (j=0; j != N; j++)
   {
      ATL_pfavR(pB0, cwrdKB, 2);
      ATL_pfavW(C, cwrdKB, 1);
      for (i=0; i != M; i++)
      {
         ATL_pfavR(pA0, cwrdKB, 0);
         #ifdef BETA0
            rC0_0 = ATL_rzero;
         #else
            rC0_0 = *C;
         #endif
         vC0_0 = nzero;
         /* 4 floats per step; KB is a multiple of 16 (checked above), so k
            lands exactly on KB */
         for (k=0; k != KB; k+= 4)
         {
            vA0 = vec_ld(0, pA0+k);
            vB0 = vec_ld(0, pB0+k);
            vC0_0 = vec_madd(vA0, vB0, vC0_0);
         }
         pA0 += KB;
         /* spill the accumulator to tC so its four lanes can be summed as
            scalars */
         vec_st(vC0_0, 0, tC);
         #ifdef BETAX
            *C++ = rC0_0 * beta + *tC + tC[1] + tC[2] + tC[3];
         #else
            *C++ = rC0_0 + *tC + tC[1] + tC[2] + tC[3];
         #endif
         #ifdef TCPLX
            /* step over a second element per entry; presumably the other
               component of a complex value -- confirm */
            C++;
         #endif
      }
      pB0 += KB;
      pA0 = A;   /* every j re-reads A from its start */
      C += incCn;
   }
}
#endif
/*
 * NOTE(review): ATL_USERMM begins on the next line but continues beyond the
 * end of this excerpt, so its text is left exactly as it was extracted.
 */
void ATL_USERMM   (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)/* * matmul with muladd=1, TA=T, TB=N, mu=4, nu=4, ku=2, prefetching A and B */{   const int n4=(N>>2)<<2, m4 = (M>>2)<<2;   const TYPE *stM = A + KB*m4;   const TYPE *stN = B + KB*n4;   #ifdef ATL_AltiVec      int blkstride, cwrdKB, cwrdC=ATL_MulBySize(8);   #endif   const int incAn = -KB*m4;   const int incBm = -KB;   #define incAm KB3   #define incBn KB4   #ifdef TREAL      #define incCm 4      const int incCn = (((ldc) << 2)) - m4;   #else      #define incCm 8      const int incCn = (((ldc) << 3)) - (m4+m4);   #endif   const int kstart = (KB>>4)-1;   void *vC;   TYPE *tC;   TYPE *pC0=C, *pC1=pC0+(ldc SHIFT), *pC2=pC1+(ldc SHIFT),*pC3=pC2+(ldc SHIFT);   const TYPE *pA0=A;   const TYPE *pB0=B;   register int k;   register TYPE rA0, rA1, rA2, rA3, ra0, ra1, ra2, ra3;   register TYPE rB0, rB1, rB2, rB3, rb0, rb1, rb2, rb3;   register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC0_1, rC1_1, rC2_1, rC3_1,                 rC0_2, rC1_2, rC2_2, rC3_2, rC0_3, rC1_3, rC2_3, rC3_3;   vector float vA0, vA1, vA2, vA3, vB0, vB1, vB2, vB3;   vector float  vC0_0, vC1_0, vC2_0, vC3_0, vC0_1, vC1_1, vC2_1, vC3_1,                 vC0_2, vC1_2, vC2_2, vC3_2, vC0_3, vC1_3, vC2_3, vC3_3;   const vector float nzero = VECTOR_INIT(-0.0f, -0.0f, -0.0f, -0.0f);   #ifndef ATL_NoIEEE /* turn on java/ieee mode */      #ifdef ATL_AVgcc         const vector int izero   =  VECTOR_INITI(0,0,0,0);         vec_mtvscr(izero);      #else         vec_mtvscr((vector unsigned long)(0));      #endif   #endif   vC = malloc(ATL_Cachelen + sizeof(float)*16);   ATL_assert(vC);   tC = 
ATL_AlignPtr(vC);   #ifdef ATL_AltiVec/* *    k is blkcount, cwrdKB is block size */      k = 1; /* blkcount set to 1 unless KB too large */      cwrdKB = (ATL_MulBySize(KB)+15) >> 4;  /* # of 16-byte words in KB */      while (cwrdKB > 32)      {         cwrdKB >>= 1;         k <<= 1;      }      if (cwrdKB == 32) cwrdKB = 0;      blkstride = (KB * sizeof(TYPE)) / k;      ATL_pfavR(A, ATL_GetCtrl(blkstride, k*KB, cwrdKB), 3);      cwrdKB = ATL_GetCtrl(blkstride, k, cwrdKB);      if (cwrdC >= 16) cwrdC >>= 4;      else cwrdC = 1;      cwrdC = ATL_GetCtrl(0, 1, cwrdC);   #endif   #ifdef ATL_MNCLEAN   if (pB0 != stN && pA0 != stM)   {   #endif   do /* N-loop */   {      ATL_pfavR(pB0, cwrdKB, 0);      ATL_pfavR(pB0+KB , cwrdKB, 1);      ATL_pfavR(pB0+KB2, cwrdKB, 2);      ATL_pfavR(pB0+KB3, cwrdKB, 3);      #ifdef ATL_MCLEAN      if (pA0 != stM)      {      #endif      do /* M-loop */      {         vC0_0 = vC1_0 = vC2_0 = vC3_0 =         vC0_1 = vC1_1 = vC2_1 = vC3_1 =         vC0_2 = vC1_2 = vC2_2 = vC3_2 =         vC0_3 = vC1_3 = vC2_3 = vC3_3 = nzero;         #ifdef BETA0            rC0_0 = rC1_0 = rC2_0 = rC3_0 =            rC0_1 = rC1_1 = rC2_1 = rC3_1 =            rC0_2 = rC1_2 = rC2_2 = rC3_2 =            rC0_3 = rC1_3 = rC2_3 = rC3_3 = ATL_rzero;         #else            #ifdef TREAL               rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3];               rC0_1 = *pC1; rC1_1 = pC1[1]; rC2_1 = pC1[2]; rC3_1 = pC1[3];               rC0_2 = *pC2; rC1_2 = pC2[1]; rC2_2 = pC2[2]; rC3_2 = pC2[3];               rC0_3 = *pC3; rC1_3 = pC3[1]; rC2_3 = pC3[2]; rC3_3 = pC3[3];            #else               rC0_0 = *pC0; rC1_0 = pC0[2]; rC2_0 = pC0[4]; rC3_0 = pC0[6];               rC0_1 = *pC1; rC1_1 = pC1[2]; rC2_1 = pC1[4]; rC3_1 = pC1[6];               rC0_2 = *pC2; rC1_2 = pC2[2]; rC2_2 = pC2[4]; rC3_2 = pC2[6];               rC0_3 = *pC3; rC1_3 = pC3[2]; rC2_3 = pC3[4]; rC3_3 = pC3[6];            #endif         #endif         vA0 = vec_ld(0, pA0); 
        vA1 = vec_ld(0, pA0+KB);         vA2 = vec_ld(0, pA0+KB2);         vA3 = vec_ld(0, pA0+KB3); pA0 += 4;         vB0 = vec_ld(0, pB0);         vB1 = vec_ld(0, pB0+KB);         vB2 = vec_ld(0, pB0+KB2);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?