📄 atl_mm4x4x2us_mn.c
字号:
/*
 * Automatically Tuned Linear Algebra Software v3.8.0
 * (C) Copyright 2000 R. Clint Whaley
 *
 * Code contributers : R. Clint Whaley, Viet Nguyen and Peter Strazdins
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions, and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. The name of the ATLAS group or the names of its contributers may
 * not be used to endorse or promote products derived from this
 * software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
/*
 * NOTE: This is a direct adaptation of Viet Nguyen's and Peter Strazdins'
 * ATL_mm4x4x2US.c code for fixed NB. I'm not sure who should hold
 * the copyright in such a case; essentially, I did the typing, but it
 * completely uses their design for the inner kernel. This file handles
 * M or N loop cleanup of arbitrary dimension.
*/
#include <atlas_misc.h>

/*
 * Build-time configuration checks:
 *   - KB must be defined, nonzero and even.
 *   - NB undefined or 0 selects CLEANING_N; otherwise NB must be a
 *     multiple of 4.
 *   - MB undefined or 0 selects CLEANING_M; otherwise MB must be a
 *     multiple of 4.
 *   - CLEANING_M and CLEANING_N may not both be selected.
 */
#if !defined(KB) || KB == 0 || (KB/2)*2 != KB
   #error KB must be nonzero multiple of 2
#endif
#if !defined(NB) || NB == 0
   #define CLEANING_N
#elif ( (NB/4)*4 != NB )
   #error NB must be multiple of 4
#endif
#if !defined(MB) || MB == 0
   #define CLEANING_M
#elif ( (MB/4)*4 != MB )
   #error MB must be multiple of 4
#endif
#if defined(CLEANING_M) && defined(CLEANING_N)
   #error One of MB and NB must be defined
#endif

#if defined(CLEANING_M) || defined(CLEANING_N)
static void ATL_mm1x1x1
   (const int M, const int N, const int K, const TYPE alpha,
    const TYPE *A, const int lda, const TYPE *B, const int ldb,
    const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with TA=T, TB=N, MB=0, NB=0, KB=0,
 * lda=0, ldb=0, ldc=0, mu=1, nu=1, ku=1
 *
 * Fully scalar kernel: for every j in [0,N) and i in [0,M) it accumulates
 * the K products A[i*lda + k] * B[j*ldb + k] into one register and stores
 * the result to C.  Consecutive i are incCm elements apart in C and
 * consecutive j are ldc (TREAL) or 2*ldc (otherwise) elements apart.
 *
 * The accumulator starts at ATL_rzero when BETA0 is defined, at the
 * existing C value when BETA1 is defined, and at C*beta otherwise.
 * alpha is not referenced in this routine.
 *
 * NOTE(review): both outer loops are do/while loops that compare against an
 * end pointer only after the body has run, so M >= 1 and N >= 1 appear to
 * be required of the caller -- confirm.
 * NOTE(review): the non-TREAL strides of 2 presumably correspond to
 * interleaved complex storage -- confirm against atlas_misc.h.
 */
{
   #define Mb M
   #define Nb N
   #define Kb K
   /* End sentinels: pA0 reaches stM after M vectors, pB0 reaches stN after N */
   const TYPE *stM = A + (lda*Mb);
   const TYPE *stN = B + (ldb*Nb);
   /*
    * incAm: the K-loop advanced pA0 by K, so this steps to the next A vector.
    * incAn: rewinds pA0 from stM back to A for the next column of B.
    */
   const int incAm = ((lda) - Kb), incAn = -(Mb*lda);
   /*
    * incBm: rewinds pB0 to the start of the current B column.
    * incBn: advances pB0 to the next B column.
    */
   const int incBm = -(Kb), incBn = (ldb);
   #ifdef TREAL
      #define incCm 1
      const int incCn = (ldc) - (Mb);
   #else
      #define incCm 2
      const int incCn = (ldc - Mb)<<1;
   #endif
   TYPE *pC0=C;
   const TYPE *pA0=A;
   const TYPE *pB0=B;
   register int k;
   register TYPE rA0;
   register TYPE rB0;
   register TYPE rC0_0;
   do /* N-loop */
   {
      do /* M-loop */
      {
         #ifdef BETA0
            rC0_0 = ATL_rzero;
         #elif defined(BETA1)
            rC0_0 = *pC0;
         #else
            rC0_0 = *pC0 * beta;
         #endif
         for (k=K; k; k--) /* easy loop to unroll */
         {
            rA0 = *pA0++;
            rB0 = *pB0++;
            rC0_0 += rA0 * rB0;
         }
         *pC0 = rC0_0;
         pC0 += incCm;
         pA0 += incAm;
         pB0 += incBm;
      }
      while(pA0 != stM);
      pC0 += incCn;
      pA0 += incAn;
      pB0 += incBn;
   }
   while(pB0 != stN);
}
#ifdef incCm
   #undef incCm
#endif
#ifdef Mb
   #undef Mb
#endif
#ifdef Nb
   #undef Nb
#endif
#ifdef Kb
   #undef Kb
#endif
#endif

#ifdef CLEANING_M
static void ATL_mm1x4x1
   (const int M, const int N, const int K, const TYPE alpha,
    const TYPE *A, const int lda, const TYPE *B, const int ldb,
    const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with TA=T, TB=N, MB=0, NB=0, KB=0,
 * lda=0, ldb=0, ldc=0, mu=1, nu=4, ku=1
 *
 * Same computation as ATL_mm1x1x1, but four columns of B (and of C) are
 * processed per pass, with the multiply of step k+1 issued before the add
 * of step k (see the pipeline comments in the body).
 *
 * Nb is N rounded down to a multiple of 4; the N-Nb leftover columns are
 * handed to ATL_mm1x1x1 at the end.  When K < 3 the whole problem is handed
 * to ATL_mm1x1x1 instead.  alpha is only forwarded to ATL_mm1x1x1.
 *
 * NOTE(review): the M-loop is a do/while, so M >= 1 appears to be required
 * whenever Nb != 0 -- confirm with callers.
 */
{
   #define Mb M
   const int Nb = (N>>2)<<2;
   #define Kb K
   /*
    * One multiply is issued before the K-loop and two accumulate steps are
    * performed after it, so the loop itself runs K-2 times.
    */
   const int Kloop = K - 2;
   /* Untouched copies of the base pointers, used for the leftover columns */
   const TYPE *ca=A, *cb=B;
   TYPE *cc=C;
   const TYPE *stM = A + (lda*Mb);
   const TYPE *stN = B + (ldb*Nb);
   #define incAk 1
   const int incAm = ((lda) - Kb), incAn = -(Mb*lda);
   #define incBk 1
   /* incBn steps all four B pointers forward by four columns */
   const int incBm = -(Kb), incBn = (((ldb) << 2));
   #define incAk0 incAk
   #define incBk0 incBk
   #ifdef TREAL
      #define incCm 1
      #define ldc2 ldc
      const int incCn = (((ldc) << 2)) - (Mb);
   #else
      #define incCm 2
      const int incCn = ((((ldc) << 2)) - (Mb))<<1, ldc2=ldc<<1;
   #endif
   /* Four C pointers and four B pointers, one per column of the group */
   TYPE *pC0=C, *pC1=pC0+(ldc2), *pC2=pC1+(ldc2), *pC3=pC2+(ldc2);
   const TYPE *pA0=A;
   const TYPE *pB0=B, *pB1=pB0+(ldb), *pB2=pB1+(ldb), *pB3=pB2+(ldb);
   register int k;
   register TYPE rA0;
   register TYPE rB0, rB1, rB2, rB3;
   register TYPE m0, m1, m2, m3;
   register TYPE rC0_0, rC0_1, rC0_2, rC0_3;

   if (K < 3)
   {
      ATL_mm1x1x1(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
      return;
   }
   /* Skip the 4-column loops entirely when Nb == 0 (stN == B) */
   if (pB0 != stN)
   {
      do /* N-loop */
      {
         do /* M-loop */
         {
            #ifdef BETA0
               rC0_0 = rC0_1 = rC0_2 = rC0_3 = ATL_rzero;
            #else
               rC0_0 = *pC0;
               rC0_1 = *pC1;
               rC0_2 = *pC2;
               rC0_3 = *pC3;
               #ifdef BETAX
                  /* rB3 is borrowed to hold beta; it is reloaded below */
                  rB3 = beta;
                  rC0_0 *= rB3;
                  rC0_1 *= rB3;
                  rC0_2 *= rB3;
                  rC0_3 *= rB3;
               #endif
            #endif
/*
 *          Start pipeline
 */
            rA0 = *pA0;
            rB0 = *pB0;
            rB1 = *pB1;
            rB2 = *pB2;
            rB3 = *pB3;
            m0 = rA0 * rB0;
            m1 = rA0 * rB1;
            m2 = rA0 * rB2;
            m3 = rA0 * rB3;
            rA0 = pA0[1];
            rB0 = pB0[1];
            rB1 = pB1[1];
            rB2 = pB2[1];
            rB3 = pB3[1];
            pA0 += (incAk0);
            pB0 += (incBk0);
            pB1 += (incBk0);
            pB2 += (incBk0);
            pB3 += (incBk0);
            for (k=Kloop; k; k--) /* easy loop to unroll */
            {
               rC0_0 += m0;
               m0 = rA0 * rB0;
               rC0_1 += m1;
               m1 = rA0 * rB1;
               rC0_2 += m2;
               m2 = rA0 * rB2;
               rC0_3 += m3;
               m3 = rA0 * rB3;
               rA0 = pA0[1];
               rB0 = pB0[1];
               rB1 = pB1[1];
               rB2 = pB2[1];
               rB3 = pB3[1];
               pA0 += incAk;
               pB0 += incBk;
               pB1 += incBk;
               pB2 += incBk;
               pB3 += incBk;
            }
/*
 *          Drain pipe on last iteration of K-loop
 */
            rC0_0 += m0;
            m0 = rA0 * rB0;
            rC0_1 += m1;
            m1 = rA0 * rB1;
            rC0_2 += m2;
            m2 = rA0 * rB2;
            rC0_3 += m3;
            m3 = rA0 * rB3;
            rC0_0 += m0;
            rC0_1 += m1;
            rC0_2 += m2;
            rC0_3 += m3;
            pA0 += incAk0;
            pB0 += incBk0;
            pB1 += incBk0;
            pB2 += incBk0;
            pB3 += incBk0;
            *pC0 = rC0_0;
            *pC1 = rC0_1;
            *pC2 = rC0_2;
            *pC3 = rC0_3;
            pC0 += incCm;
            pC1 += incCm;
            pC2 += incCm;
            pC3 += incCm;
            pA0 += incAm;
            pB0 += incBm;
            pB1 += incBm;
            pB2 += incBm;
            pB3 += incBm;
         }
         while(pA0 != stM);
         pC0 += incCn;
         pC1 += incCn;
         pC2 += incCn;
         pC3 += incCn;
         pA0 += incAn;
         pB0 += incBn;
         pB1 += incBn;
         pB2 += incBn;
         pB3 += incBn;
      }
      while(pB0 != stN);
   }
   /*
    * Intentional assignment: k becomes the number of leftover columns
    * (N - Nb); when nonzero they are computed by the scalar kernel.
    */
   if (k=N-Nb)
      ATL_mm1x1x1(M, k, K, alpha, ca, lda, cb + (Nb*ldb), ldb, beta, cc + (Nb*ldc2), ldc);
}
#ifdef ldc2
   #undef ldc2
#endif
#ifdef incAm
   #undef incAm
#endif
#ifdef incAn
   #undef incAn
#endif
#ifdef incAk
   #undef incAk
#endif
#ifdef incBm
   #undef incBm
#endif
#ifdef incBn
   #undef incBn
#endif
#ifdef incBk
   #undef incBk
#endif
#ifdef incCm
   #undef incCm
#endif
#ifdef incCn
   #undef incCn
#endif
#ifdef incCk
   #undef incCk
#endif
#ifdef Mb
   #undef Mb
#endif
#ifdef Nb
   #undef Nb
#endif
#ifdef Kb
   #undef Kb
#endif
#endif

#ifdef CLEANING_N
static void ATL_mm4x1x1
   (const int M, const int N, const int K, const TYPE alpha,
    const TYPE *A, const int lda, const TYPE *B, const int ldb,
    const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with TA=T, TB=N, MB=0, NB=0, KB=0,
 * lda=0, ldb=0, ldc=0, mu=4, nu=1, ku=1
 */
{
   const int Mb = (M>>2)<<2;
   #define Nb N
   #define Kb K
   const int Kloop = K - 2;
   const TYPE *ca=A, *cb=B;
   TYPE *cc=C;
   const TYPE *stM = A + (lda*Mb);
   const TYPE *stN = B + (ldb*Nb);
   #define incAk 1
   const int incAm = ((((lda) << 2)) - Kb), incAn = -(Mb*lda);
   #define incBk 1
   const int incBm = -(Kb), incBn = (ldb);
   #define incAk0 incAk
   #define incBk0 incBk
   #ifdef TREAL
      #define incCm 4
      const int incCn = (ldc) - (Mb);
   #else
      #define incCm 8
      const int incCn = (ldc - Mb)<<1;
   #endif
   TYPE *pC0=C;
   const TYPE *pA0=A, *pA1=pA0+(lda), *pA2=pA1+(lda), *pA3=pA2+(lda);
   const TYPE *pB0=B;
   register int k;
   register TYPE rA0, rA1, rA2, rA3;
   register TYPE rB0;
   register TYPE m0, m1, m2, m3;
   register TYPE rC0_0, rC1_0, rC2_0, rC3_0;

   if (K < 3)
   {
      ATL_mm1x1x1(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
      return;
   }
   if (pA0 != stM)
   {
      do /* N-loop */
      {
         do /* M-loop
*/ { #ifdef BETA0 rC0_0 = rC1_0 = rC2_0 = rC3_0 = ATL_rzero; #else #ifdef TREAL rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3]; #else rC0_0 = *pC0; rC1_0 = pC0[2]; rC2_0 = pC0[4]; rC3_0 = pC0[6]; #endif #ifdef BETAX rA3 = beta; rC0_0 *= rA3; rC1_0 *= rA3; rC2_0 *= rA3; rC3_0 *= rA3; #endif #endif/* * Start pipeline
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -