atl_trsmkl.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 282 行

C
282
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 1997 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_kern3.h"void Mjoin(Mjoin(Mjoin(Mjoin(Mjoin(PATL,trsmK),SideNM),UploNM),N),UnitNM)   (const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda,    TYPE *B, const int ldb)/* * This source file is multiply compiled to create the following routines: * ATL_trsmKLUNU : trsm Side='Left', Uplo='Upper', Trans='N', Unit='Unit' * ATL_trsmKLUNN : trsm Side='Left', Uplo='Upper', Trans='N', Unit='NonUnit' * ATL_trsmKLLNU : trsm Side='Left', Uplo='Lower', Trans='N', Unit='Unit' * ATL_trsmKLLNN : trsm Side='Left', Uplo='Lower', Trans='N', Unit='NonUnit' * * The transpose cases are handled by a higher level routine that copies A */{   int inc, m, n = (N>>3)<<3;   register int k, i, j;   TYPE *X0=B, *X1=B+ldb, *X2=X1+ldb, *X3=X2+ldb, *X4=X3+ldb;   TYPE *X5=X4+ldb, *X6=X5+ldb, *X7=X6+ldb;   register TYPE x0, x1, x2, x3, x4, x5, x6, x7, a0;   const TYPE *a;   #ifdef Upper_      const int ldap1 = lda + 1;      const TYPE *Ad;   #endif   #ifndef UnitDiag_      const TYPE one=1.0;      void *vp;      TYPE *diag;   #endif/* * If non-unit, precompute inverse of diagonal entries */   #ifndef UnitDiag_      vp = malloc(ATL_Cachelen+ATL_MulBySize(M));      ATL_assert(vp);      diag = ATL_AlignPtr(vp);      a = A;      inc = lda + 1;      for (i=0; i != M; i++)      {         diag[i] = one / *a;         a += inc;      }   #endif   inc = ldb << 3;   for (j=0; j != n; j += 8)   {   #ifdef Upper_      Ad = A + M*lda+M-1;      for (i=M-1; i >= 0; i--)   #else      for (i=0; i != M; i++)   #endif      {         x0 = X0[i];         x0 *=  alpha;         x1 = X1[i];         x1 *=  alpha;         x2 = X2[i];         x2 *=  alpha;         #ifdef Upper_            a = Ad;            Ad -= ldap1;         #else            a = A + i;         #endif         x3 = X3[i];         x3 *=  alpha;         x4 = X4[i];         x4 *=  alpha;         x5 = X5[i];         x5 *=  alpha;         x6 = X6[i];         x6 *=  alpha;         x7 = X7[i];         x7 *=  alpha;      #ifdef Upper_         for (k=i+1; k != M; k++)      #else         for (k=0; k != i; k++)      #endif         {            a0 = *a;            x0 -= a0 * X0[k];            x1 -= a0 * X1[k];            a += lda;            x2 -= a0 * X2[k];            x3 -= a0 * X3[k];            x4 -= a0 * X4[k];            x5 -= a0 * X5[k];            x6 -= a0 * X6[k];            x7 -= a0 * X7[k];         }         #ifndef UnitDiag_            a0 = diag[i];            x0 *= a0;            x1 *= a0;            x2 *= a0;            x3 *= a0;            x4 *= a0;            x5 *= a0;            x6 *= a0;            x7 *= a0;         #endif         X0[i] = x0;         X1[i] = x1;         X2[i] = x2;         X4[i] = x4;         X3[i] = x3;         X5[i] = x5;         X6[i] = x6;         X7[i] = x7;      }      X0 += inc;      X1 += inc;      X2 += inc;      X3 += inc;      X4 += inc;      X5 += inc;      X6 += inc;      X7 += inc;   }   if ( n=N-n )   {      inc = lda << 3;      B = X0;      for (j=0; j != n; j++)  /* N-loop cleanup */      {      #ifdef Upper_         Ad = A + M*lda+M-1;         for (i=M-1; i >= 0; i--)      #else         for (i=0; i != M; i++)      #endif         {         #ifdef Upper_            X0 = (TYPE*) Ad;            Ad -= ldap1;         #else            X0 = (TYPE*) A+i;         #endif            X1 = X0 + lda;            X2 = X1+lda;            X3 = X2+lda;            X4 = X3+lda;            X5 = X4+lda;            X6 = X5+lda;            X7 = X6+lda;            x0 = B[i];            x0 *=  alpha;            x1 = x2 = x3 = x4 = x5 = x6 = x7 = 0.0;         #ifdef Upper_            k = i + 1;            m = M - k;            m = (m >> 3)<<3;            for (m += k; k != m; k += 8)         #else            m = (i >> 3)<<3;            for (k=0; k != m; k += 8)         #endif            {               x0 -= *X0 * B[k];               X0 += inc;               x1 -= *X1 * B[k+1];               X1 += inc;               x2 -= *X2 * B[k+2];               X2 += inc;               x3 -= *X3 * B[k+3];               X3 += inc;               x4 -= *X4 * B[k+4];               X4 += inc;               x5 -= *X5 * B[k+5];               X5 += inc;               x6 -= *X6 * B[k+6];               X6 += inc;               x7 -= *X7 * B[k+7];               X7 += inc;            }         #if Upper_            switch(M-m)         #else            switch(i-m)         #endif            {            case 1:               x0 -= *X0 * B[m];               break;            case 2:               x0 -= *X0 * B[m];               x1 -= *X1 * B[m+1];               break;            case 3:               x0 -= *X0 * B[m];               x1 -= *X1 * B[m+1];               x2 -= *X2 * B[m+2];               break;            case 4:               x0 -= *X0 * B[m];               x1 -= *X1 * B[m+1];               x2 -= *X2 * B[m+2];               x3 -= *X3 * B[m+3];               break;            case 5:               x0 -= *X0 * B[m];               x1 -= *X1 * B[m+1];               x2 -= *X2 * B[m+2];               x3 -= *X3 * B[m+3];               x4 -= *X4 * B[m+4];               break;            case 6:               x0 -= *X0 * B[m];               x1 -= *X1 * B[m+1];               x2 -= *X2 * B[m+2];               x3 -= *X3 * B[m+3];               x4 -= *X4 * B[m+4];               x5 -= *X5 * B[m+5];               break;            case 7:               x0 -= *X0 * B[m];               x1 -= *X1 * B[m+1];               x2 -= *X2 * B[m+2];               x3 -= *X3 * B[m+3];               x4 -= *X4 * B[m+4];               x5 -= *X5 * B[m+5];               x6 -= *X6 * B[m+6];               break;            default:;            }            x0 += x1;            x2 += x3;            x4 += x5;            x6 += x7;            x0 += x2;            x4 += x6;            x0 += x4;            #ifndef UnitDiag_               x0 *= diag[i];            #endif            B[i] = x0;         }         B += ldb;      }   }   #ifndef UnitDiag_      free(vp);   #endif}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?