atl_mm4x4x2us.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 497 行
C
497 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 1999 The Australian National University * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * *//************************************************************************//* Level 3 BLAS - UltraSPARC-tuned Matrix-Matrix multiply *//************************************************************************//* this file contains a LI ATLAS dgemm() implementation * with code optimized for the UltraSPARC I & II architectures. * It should be compiled with gcc 2.8.1 or later: * gcc -mcpu=ultrasparc -O -fomit-frame-pointer -mtune=ultrasparc ... * to preserve the pipelining of operations. * * authors: * Viet Nguyen (Feb 1999): dgemm() and matrix multiply routines * Peter Strazdins (1998, Aug 2000): support routines and packaging * from the Department of Computer Science, Australian National University * * Copyright (C) 1998-2000 The Australian National University * * Modified by Clint Whaley on Oct 2000 to add precision independence, * and use of ATLAS infrastructure. */#include "atlas_misc.h"#if !defined(KB) || (KB == 1) || (KB == 0)static void ATL_myger(const M, const N, const TYPE *X, const TYPE *Y, const TYPE beta, TYPE *C, const int ldc){ const TYPE *stY = Y + N; #ifdef TCPLX const int ldc2 = ldc<<1; #define incC 2 #else #define ldc2 ldc #define incC 1 #endif do { #ifdef BETAX Mjoin(PATLU,axpby)(M, *Y++, X, 1, beta, C, incC); #else Mjoin(PATLU,axpy)(M, *Y++, X, 1, C, incC); #endif C += ldc2; } while (Y != stY);}#undef incC#ifdef ldc2 #undef ldc2#endif#endifstatic inline void __InnerLoop_(int K, const TYPE *A, int CIncA, const TYPE *B, int RIncB, const TYPE beta, TYPE *C, int LDC);/***************** main driver routines *******************************/void ATL_USERMM(int m, int n, int k, const TYPE alpha, const TYPE *A, const int LdA, const TYPE *B, const int LdB, const TYPE beta, TYPE *C, const int LdC ){ int i, j, msize, ki; const int incA=LdA<<2, incB=LdB<<2, incC=(LdC<<2)SHIFT; #if !defined(KB) || (KB == 1) || (KB == 0) if (k == 1) { ATL_myger(m, n, A, B, beta, C, LdC); return; } #endif while (m>0) { /* while : partition level */ const TYPE *a, *b; TYPE *c; c = C; b = B; msize = m; for (i=-n; i<0; i+=4) { TYPE *cx; cx = c; a = A; for (j=-msize; j<0; j+=4) { __InnerLoop_(k, a, LdA, b, LdB, beta, cx, (LdC SHIFT)); a += incA; cx += 4 SHIFT; } /* for j */ b += incB; c += incC; } /* for i */ A += msize*LdA; C += msize SHIFT; m -= msize; } /* while m>0 */}static inline void __InnerLoop_(int K, const TYPE *A, int CIncA, const TYPE *B, int RIncB, TYPE beta, TYPE *C, int LDC)/* * Post: C^T = B^T * A^T * Pre: C^T is row major, LDC is RIncC * RIncA (CIncA) is row(column) inc of A^T. * Similar for RIncB, CIncB. * K > 1 */{ register TYPE c00, c01, c02, c03, c10, c11, c12, c13; register TYPE c20, c21, c22, c23, c30, c31, c32, c33; register TYPE a0, a1, a2, a3, a0a, a1a, a2a, a3a; register TYPE b0, b1, b2, b3; register TYPE t0, t1, t2, t3; const register TYPE *A0, *A2, *B0, *B2;#ifdef BETA0c00 = c01 = c02 = c03 = c10 = c11 = c12 = c13 = c20 = c21 = c22 = c23 = c30 = c31 = c32 = c33 = 0.0;C += 3*LDC;A0 = A; A2 = A0 + (CIncA<<1);B0 = B; B2 = B0 + (RIncB<<1);#else c00 = *C; c01 = C[1 SHIFT]; A0 = A; c02 = C[2 SHIFT]; c03 = C[3 SHIFT]; C += LDC; c10 = *C; c11 = C[1 SHIFT]; A2 = A0 + (CIncA<<1); c12 = C[2 SHIFT]; c13 = C[3 SHIFT]; C += LDC; c20 = *C; c21 = C[1 SHIFT]; B0 = B; c22 = C[2 SHIFT]; c23 = C[3 SHIFT]; C += LDC; c30 = *C; c31 = C[1 SHIFT]; B2 = B0 + (RIncB<<1); c32 = C[2 SHIFT]; c33 = C[3 SHIFT];#ifdef BETAX a0 = beta; c00 *= a0; c10 *= a0; c20 *= a0; c30 *= a0; c01 *= a0; c11 *= a0; c21 *= a0; c31 *= a0; c02 *= a0; c12 *= a0; c22 *= a0; c32 *= a0; c03 *= a0; c13 *= a0; c23 *= a0; c33 *= a0;#endif#endif a0 = *A0; a1 = *(A0+CIncA); A0++; b0 = *B0; b1 = *(B0+RIncB); B0++; a2 = *A2; a3 = *(A2+CIncA); A2++; b2 = *B2; b3 = *(B2+RIncB); B2++; a0a = *A0; t0 = b0*a0; a1a = *(A0+CIncA); A0++; t1 = b0*a1; a2a = *A2; t2 = b0*a2; a3a = *(A2+CIncA); A2++; for (K=-K+3; K<0; K+=2) { t3 = b0*a3; b0 = *B0; c00 += t0; t0 = b1*a0; c01 += t1; t1 = b1*a1; c02 += t2; t2 = b1*a2; c03 += t3; t3 = b1*a3; b1 = *(B0+RIncB); c10 += t0; t0 = b2*a0; B0++; c11 += t1; t1 = b2*a1; c12 += t2; t2 = b2*a2; c13 += t3; t3 = b2*a3; b2 = *B2; c20 += t0; t0 = b3*a0; a0 = *A0; c21 += t1; t1 = b3*a1; c22 += t2; t2 = b3*a2; a1 = *(A0+CIncA); c23 += t3; t3 = b3*a3; b3 = *(B2+RIncB); c30 += t0; t0 = b0*a0a; A0++; c31 += t1; t1 = b0*a1a; a2 = *A2; c32 += t2; t2 = b0*a2a; B2++; c33 += t3; t3 = b0*a3a; a3 = *(A2+CIncA); c00 += t0; t0 = b1*a0a; A2++; c01 += t1; t1 = b1*a1a; b0 = *B0; c02 += t2; t2 = b1*a2a; c03 += t3; t3 = b1*a3a; b1 = *(B0+RIncB); c10 += t0; t0 = b2*a0a; B0++; c11 += t1; t1 = b2*a1a; c12 += t2; t2 = b2*a2a; c13 += t3; t3 = b2*a3a; b2 = *B2; c20 += t0; t0 = b3*a0a; a0a = *A0; c21 += t1; t1 = b3*a1a; a1a = *(A0+CIncA); c22 += t2; t2 = b3*a2a; a2a = *A2; A0++; c23 += t3; t3 = b3*a3a; b3 = *(B2+RIncB); c30 += t0; t0 = b0*a0; a3a = *(A2+CIncA); B2++; c31 += t1; t1 = b0*a1; A2++; c32 += t2; t2 = b0*a2; c33 += t3; } /* for */ t3 = b0*a3; b0 = *B0; c00 += t0; t0 = b1*a0; c01 += t1; t1 = b1*a1; c02 += t2; t2 = b1*a2; c03 += t3; t3 = b1*a3; b1 = *(B0+RIncB); c10 += t0; t0 = b2*a0; B0++; c11 += t1; t1 = b2*a1; c12 += t2; t2 = b2*a2; c13 += t3; if (K) { t3 = b2*a3; b2 = *B2; c20 += t0; t0 = b3*a0; c21 += t1; t1 = b3*a1; c22 += t2; t2 = b3*a2; c23 += t3; t3 = b3*a3; b3 = *(B2+RIncB); c30 += t0; t0 = b0*a0a; c31 += t1; t1 = b0*a1a; B2++; c32 += t2; t2 = b0*a2a; C -= LDC; c33 += t3; t3 = b0*a3a; C -= LDC; c00 += t0; t0 = b1*a0a; C -= LDC; c01 += t1; t1 = b1*a1a; c02 += t2; t2 = b1*a2a; *C = c00; c03 += t3; t3 = b1*a3a; C[1 SHIFT] = c01; c10 += t0; t0 = b2*a0a; C[2 SHIFT] = c02; c11 += t1; t1 = b2*a1a; C[3 SHIFT] = c03; C += LDC; c12 += t2; t2 = b2*a2a; *C = c10; c13 += t3; t3 = b2*a3a; C[1 SHIFT] = c11; c20 += t0; t0 = b3*a0a; C[2 SHIFT] = c12; c21 += t1; t1 = b3*a1a; C[3 SHIFT] = c13; C += LDC; c22 += t2; t2 = b3*a2a; *C = c20; c23 += t3; t3 = b3*a3a; C[1 SHIFT] = c21; c30 += t0; C[2 SHIFT] = c22; c31 += t1; C[3 SHIFT] = c23; C += LDC; c32 += t2; *C = c30; c33 += t3; C[1 SHIFT] = c31; C[2 SHIFT] = c32; C[3 SHIFT] = c33; } else { t3 = b2*a3; b2 = *B2; c20 += t0; t0 = b3*a0; a0 = *A0; c21 += t1; t1 = b3*a1; c22 += t2; t2 = b3*a2; a1 = *(A0+CIncA); c23 += t3; t3 = b3*a3; b3 = *(B2+RIncB); c30 += t0; t0 = b0*a0a; A0++; c31 += t1; t1 = b0*a1a; a2 = *A2; c32 += t2; t2 = b0*a2a; B2++; c33 += t3; t3 = b0*a3a; a3 = *(A2+CIncA); c00 += t0; t0 = b1*a0a; A2++; c01 += t1; t1 = b1*a1a; b0 = *B0; c02 += t2; t2 = b1*a2a; c03 += t3; t3 = b1*a3a; b1 = *(B0+RIncB); c10 += t0; t0 = b2*a0a; B0++; c11 += t1; t1 = b2*a1a; c12 += t2; t2 = b2*a2a; c13 += t3; t3 = b2*a3a; b2 = *B2; c20 += t0; t0 = b3*a0a; c21 += t1; t1 = b3*a1a; c22 += t2; t2 = b3*a2a; c23 += t3; t3 = b3*a3a; b3 = *(B2+RIncB); c30 += t0; t0 = b0*a0; B2++; c31 += t1; t1 = b0*a1; c32 += t2; t2 = b0*a2; c33 += t3; t3 = b0*a3; C -= LDC; c00 += t0; t0 = b1*a0; C -= LDC; c01 += t1; t1 = b1*a1; C -= LDC; c02 += t2; t2 = b1*a2; *C = c00; c03 += t3; t3 = b1*a3; C[1 SHIFT] = c01; c10 += t0; t0 = b2*a0; C[2 SHIFT] = c02; c11 += t1; t1 = b2*a1; C[3 SHIFT] = c03; C += LDC; c12 += t2; t2 = b2*a2; *C = c10; c13 += t3; t3 = b2*a3; C[1 SHIFT] = c11; c20 += t0; t0 = b3*a0; C[2 SHIFT] = c12; c21 += t1; t1 = b3*a1; C[3 SHIFT] = c13; C += LDC; c22 += t2; t2 = b3*a2; *C = c20; c23 += t3; t3 = b3*a3; C[1 SHIFT] = c21; c30 += t0; C[2 SHIFT] = c22; c31 += t1; C[3 SHIFT] = c23; C += LDC; c32 += t2; *C = c30; c33 += t3; C[1 SHIFT] = c31; C[2 SHIFT] = c32; C[3 SHIFT] = c33; } /* if (k) */} /* __InnerLoop_() */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?