atl_mm6x8x8_1p.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,986 行 · 第 1/5 页
C
1,986 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2000 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_misc.h"/* * prefetch actually slows down complex, so don't do it */#if defined(TCPLX) && defined(ATL_ARCH_IA64Itan) #undef ATL_ARCH_IA64Itan#endif#include "atlas_prefetch.h"#if !defined(MB) || MB == 0 #define ATL_CleanM#elif ( (MB/6)*6 != MB ) #error MB must be multiple of 6!!#endif#if !defined(NB) || NB == 0 #define ATL_CleanN#elif ( (NB/8)*8 != NB ) #error NB must be multiple of 8!!#endif#if !defined(KB) || KB == 0 #define ATL_CleanK#elif ( (KB/8)*8 != KB ) #error KB must be multiple of 8!!#endif#ifdef ATL_CleanM#if defined(CleanN) || defined(CleanK) #error Can clean only one dimension at a time!!#endifstatic void CleanM (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)/* * matmul with TA=T, TB=N, MB=0, NB=0, KB=0, * lda=0, ldb=0, ldc=0, mu=6, nu=8, ku=2 */{ const int Nb = (N>>3)<<3; const int Kb = (K>>3)<<3; #define PFD KB6 const int Kstart = (K>>3) - 1; #define PFB 16 const TYPE *stN = B + (ldb*Nb); const int incAn = -K, incBn = (ldb<<3) - K, incCn = (ldc<<3)SHIFT; #ifdef TREAL #define ldc2 ldc #else const int ldc2=ldc<<1; #endif TYPE *pC0=C, *pC1=pC0+(ldc2), *pC2=pC1+(ldc2), *pC3=pC2+(ldc2), *pC4=pC3+(ldc2), *pC5=pC4+(ldc2), *pC6=pC5+(ldc2), *pC7=pC6+(ldc2); const TYPE *pA0=A, *pA1=pA0+(lda), *pA2=pA1+(lda), *pA3=pA2+(lda), *pA4=pA3+(lda), *pA5=A; const TYPE *pB0=B, *pB1=pB0+(ldb), *pB2=pB1+(ldb), *pB3=pB2+(ldb), *pB4=pB3+(ldb), *pB5=pB4+(ldb), *pB6=pB5+(ldb), *pB7=pB6+ldb; register int k; #ifdef BETAX TYPE *bp = (TYPE *) β #endif register TYPE rA0, rA1, rA2, rA3, rA4, rA5; register TYPE ra0, ra1, ra2, ra3, ra4, ra5; register TYPE rB0, rB1, rB2, rB3, rB4, rB5, rB6, rB7; register TYPE rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC4_0, rC5_0, rC0_1, rC1_1, rC2_1, rC3_1, rC4_1, rC5_1, rC0_2, rC1_2, rC2_2, rC3_2, rC4_2, rC5_2, rC0_3, rC1_3, rC2_3, rC3_3, rC4_3, rC5_3, rC0_4, rC1_4, rC2_4, rC3_4, rC4_4, rC5_4, rC0_5, rC1_5, rC2_5, rC3_5, rC4_5, rC5_5, rC0_6, rC1_6, rC2_6, rC3_6, rC4_6, rC5_6, rC0_7, rC1_7, rC2_7, rC3_7, rC4_7, rC5_7; switch(M) { case 1: pA1 = A; case 2: pA2 = A; case 3: pA3 = A; case 4: pA4 = A; default:; } do /* N-loop */ { rC0_0 = rC1_0 = rC2_0 = rC3_0 = rC4_0 = rC5_0 = rC0_1 = rC1_1 = rC2_1 = rC3_1 = rC4_1 = rC5_1 = rC0_2 = rC1_2 = rC2_2 = rC3_2 = rC4_2 = rC5_2 = rC0_3 = rC1_3 = rC2_3 = rC3_3 = rC4_3 = rC5_3 = rC0_4 = rC1_4 = rC2_4 = rC3_4 = rC4_4 = rC5_4 = rC0_5 = rC1_5 = rC2_5 = rC3_5 = rC4_5 = rC5_5 = rC0_6 = rC1_6 = rC2_6 = rC3_6 = rC4_6 = rC5_6 = rC0_7 = rC1_7 = rC2_7 = rC3_7 = rC4_7 = rC5_7 = ATL_rzero; #ifndef BETA0 switch(M) { case 5: #ifdef TREAL rC4_0 = pC0[4]; rC4_1 = pC1[4]; rC4_2 = pC2[4]; rC4_3 = pC3[4]; rC4_4 = pC4[4]; rC4_5 = pC5[4]; rC4_6 = pC6[4]; rC4_7 = pC7[4]; #else rC4_0 = pC0[8]; rC4_1 = pC1[8]; rC4_2 = pC2[8]; rC4_3 = pC3[8]; rC4_4 = pC4[8]; rC4_5 = pC5[8]; rC4_6 = pC6[8]; rC4_7 = pC7[8]; #endif case 4: #ifdef TREAL rC3_0 = pC0[3]; rC3_1 = pC1[3]; rC3_2 = pC2[3]; rC3_3 = pC3[3]; rC3_4 = pC4[3]; rC3_5 = pC5[3]; rC3_6 = pC6[3]; rC3_7 = pC7[3]; #else rC3_0 = pC0[6]; rC3_1 = pC1[6]; rC3_2 = pC2[6]; rC3_3 = pC3[6]; rC3_4 = pC4[6]; rC3_5 = pC5[6]; rC3_6 = pC6[6]; rC3_7 = pC7[6]; #endif case 3: #ifdef TREAL rC2_0 = pC0[2]; rC2_1 = pC1[2]; rC2_2 = pC2[2]; rC2_3 = pC3[2]; rC2_4 = pC4[2]; rC2_5 = pC5[2]; rC2_6 = pC6[2]; rC2_7 = pC7[2]; #else rC2_0 = pC0[4]; rC2_1 = pC1[4]; rC2_2 = pC2[4]; rC2_3 = pC3[4]; rC2_4 = pC4[4]; rC2_5 = pC5[4]; rC2_6 = pC6[4]; rC2_7 = pC7[4]; #endif case 2: #ifdef TREAL rC1_0 = pC0[1]; rC1_1 = pC1[1]; rC1_2 = pC2[1]; rC1_3 = pC3[1]; rC1_4 = pC4[1]; rC1_5 = pC5[1]; rC1_6 = pC6[1]; rC1_7 = pC7[1]; #else rC1_0 = pC0[2]; rC1_1 = pC1[2]; rC1_2 = pC2[2]; rC1_3 = pC3[2]; rC1_4 = pC4[2]; rC1_5 = pC5[2]; rC1_6 = pC6[2]; rC1_7 = pC7[2]; #endif default: rC0_0 = *pC0; rC0_1 = *pC1; rC0_2 = *pC2; rC0_3 = *pC3; rC0_4 = *pC4; rC0_5 = *pC5; rC0_6 = *pC6; rC0_7 = *pC7; } #ifdef BETAX rb7 = *bp; rC0_0 *= rb7; rC1_0 *= rb7; rC2_0 *= rb7; rC3_0 *= rb7; rC4_0 *= rb7; rC5_0 *= rb7; rC0_1 *= rb7; rC1_1 *= rb7; rC2_1 *= rb7; rC3_1 *= rb7; rC4_1 *= rb7; rC5_1 *= rb7; rC0_2 *= rb7; rC1_2 *= rb7; rC2_2 *= rb7; rC3_2 *= rb7; rC4_2 *= rb7; rC5_2 *= rb7; rC0_3 *= rb7; rC1_3 *= rb7; rC2_3 *= rb7; rC3_3 *= rb7; rC4_3 *= rb7; rC5_3 *= rb7; rC0_4 *= rb7; rC1_4 *= rb7; rC2_4 *= rb7; rC3_4 *= rb7; rC4_4 *= rb7; rC5_4 *= rb7; rC0_5 *= rb7; rC1_5 *= rb7; rC2_5 *= rb7; rC3_5 *= rb7; rC4_5 *= rb7; rC5_5 *= rb7; rC0_6 *= rb7; rC1_6 *= rb7; rC2_6 *= rb7; rC3_6 *= rb7; rC4_6 *= rb7; rC5_6 *= rb7; rC0_7 *= rb7; rC1_7 *= rb7; rC2_7 *= rb7; rC3_7 *= rb7; rC4_7 *= rb7; rC5_7 *= rb7; #endif #endif rA0 = *pA0++; rA1 = *pA1++; rA2 = *pA2++; rA3 = *pA3++; rA4 = *pA4++; rA5 = *pA5++; rB0 = *pB0++; rB1 = *pB1++; rB2 = *pB2++; rB3 = *pB3++; rB4 = *pB4++; rB5 = *pB5++; rB6 = *pB6++; rB7 = *pB7++; for (k=Kstart; k; k--) /* easy loop to unroll */ { rC0_0 += rA0 * rB0; rC1_0 += rA1 * rB0; ATL_pfl1R(pA0+PFD-1); rC2_0 += rA2 * rB0; rC3_0 += rA3 * rB0; rC4_0 += rA4 * rB0; rb0 = *pB0++; rC5_0 += rA5 * rB0; rC0_1 += rA0 * rB1; rC1_1 += rA1 * rB1; ra0 = *pA0++; rC2_1 += rA2 * rB1; rC3_1 += rA3 * rB1; rC4_1 += rA4 * rB1; ra1 = *pA1++; rC5_1 += rA5 * rB1; rC0_2 += rA0 * rB2; rC1_2 += rA1 * rB2; ra2 = *pA2++; rC2_2 += rA2 * rB2; rC3_2 += rA3 * rB2; rC4_2 += rA4 * rB2; ra3 = *pA3++; rC5_2 += rA5 * rB2; rC0_3 += rA0 * rB3; rC1_3 += rA1 * rB3; ra4 = *pA4++; rC2_3 += rA2 * rB3; rC3_3 += rA3 * rB3; rC4_3 += rA4 * rB3; ra5 = *pA5++; rC5_3 += rA5 * rB3; rC0_4 += rA0 * rB4; rC1_4 += rA1 * rB4; rb1 = *pB1++; rC2_4 += rA2 * rB4; rC3_4 += rA3 * rB4; rC4_4 += rA4 * rB4; rb2 = *pB2++; rC5_4 += rA5 * rB4; rC0_5 += rA0 * rB5; rC1_5 += rA1 * rB5; rb3 = *pB3++; rC2_5 += rA2 * rB5; rC3_5 += rA3 * rB5; ATL_pfl1R(pA1+PFD-2); rC4_5 += rA4 * rB5; rb4 = *pB4++; rC5_5 += rA5 * rB5; rC0_6 += rA0 * rB6; rC1_6 += rA1 * rB6; rb5 = *pB5++; rC2_6 += rA2 * rB6; rC3_6 += rA3 * rB6; rC4_6 += rA4 * rB6; rb6 = *pB6++; rC5_6 += rA5 * rB6; rC0_7 += rA0 * rB7; rC1_7 += rA1 * rB7; rb7 = *pB7++; rC2_7 += rA2 * rB7; rC3_7 += rA3 * rB7; rC4_7 += rA4 * rB7; rB0 = *pB0++; rC5_7 += rA5 * rB7; rC0_0 += ra0 * rb0; rC1_0 += ra1 * rb0; rA0 = *pA0++; rC2_0 += ra2 * rb0; rC3_0 += ra3 * rb0; rC4_0 += ra4 * rb0; rA1 = *pA1++; rC5_0 += ra5 * rb0; rC0_1 += ra0 * rb1; rC1_1 += ra1 * rb1; rA2 = *pA2++; rC2_1 += ra2 * rb1; rC3_1 += ra3 * rb1; rC4_1 += ra4 * rb1; rA3 = *pA3++; rC5_1 += ra5 * rb1; rC0_2 += ra0 * rb2; rC1_2 += ra1 * rb2; rA4 = *pA4++; rC2_2 += ra2 * rb2; rC3_2 += ra3 * rb2; rC4_2 += ra4 * rb2; rA5 = *pA5++; rC5_2 += ra5 * rb2; rC0_3 += ra0 * rb3; rC1_3 += ra1 * rb3; rB1 = *pB1++; rC2_3 += ra2 * rb3; rC3_3 += ra3 * rb3; rC4_3 += ra4 * rb3; rB2 = *pB2++; rC5_3 += ra5 * rb3; rC0_4 += ra0 * rb4; rC1_4 += ra1 * rb4; rB3 = *pB3++; rC2_4 += ra2 * rb4; rC3_4 += ra3 * rb4; rC4_4 += ra4 * rb4; rB4 = *pB4++; rC5_4 += ra5 * rb4; rC0_5 += ra0 * rb5; rC1_5 += ra1 * rb5; rB5 = *pB5++; rC2_5 += ra2 * rb5; rC3_5 += ra3 * rb5; rC4_5 += ra4 * rb5; rB6 = *pB6++; rC5_5 += ra5 * rb5; rC0_6 += ra0 * rb6; rC1_6 += ra1 * rb6; rC2_6 += ra2 * rb6; rB7 = *pB7++; rC3_6 += ra3 * rb6; rC4_6 += ra4 * rb6; rC5_6 += ra5 * rb6; rC0_7 += ra0 * rb7; rC1_7 += ra1 * rb7; rC2_7 += ra2 * rb7; rC3_7 += ra3 * rb7; rC4_7 += ra4 * rb7; rC5_7 += ra5 * rb7; rC0_0 += rA0 * rB0; ATL_pfl1R(pA2+PFD-3); rC1_0 += rA1 * rB0; rC2_0 += rA2 * rB0; rC3_0 += rA3 * rB0; rC4_0 += rA4 * rB0; rb0 = *pB0++; rC5_0 += rA5 * rB0; rC0_1 += rA0 * rB1; rC1_1 += rA1 * rB1; ra0 = *pA0++; rC2_1 += rA2 * rB1; rC3_1 += rA3 * rB1; rC4_1 += rA4 * rB1; ra1 = *pA1++; rC5_1 += rA5 * rB1; rC0_2 += rA0 * rB2; rC1_2 += rA1 * rB2; ra2 = *pA2++; rC2_2 += rA2 * rB2; rC3_2 += rA3 * rB2; rC4_2 += rA4 * rB2; ra3 = *pA3++; rC5_2 += rA5 * rB2; rC0_3 += rA0 * rB3; rC1_3 += rA1 * rB3; ra4 = *pA4++; rC2_3 += rA2 * rB3; rC3_3 += rA3 * rB3; rC4_3 += rA4 * rB3; ra5 = *pA5++; rC5_3 += rA5 * rB3; rC0_4 += rA0 * rB4; rC1_4 += rA1 * rB4; rb1 = *pB1++; rC2_4 += rA2 * rB4; rC3_4 += rA3 * rB4; rC4_4 += rA4 * rB4; rb2 = *pB2++; rC5_4 += rA5 * rB4; rC0_5 += rA0 * rB5; rC1_5 += rA1 * rB5; rb3 = *pB3++; rC2_5 += rA2 * rB5; ATL_pfl1R(pA3+PFD-3); rC3_5 += rA3 * rB5; rC4_5 += rA4 * rB5; rb4 = *pB4++; rC5_5 += rA5 * rB5; rC0_6 += rA0 * rB6; rC1_6 += rA1 * rB6; rb5 = *pB5++; rC2_6 += rA2 * rB6; rC3_6 += rA3 * rB6; rC4_6 += rA4 * rB6; rb6 = *pB6++; rC5_6 += rA5 * rB6; rC0_7 += rA0 * rB7; rC1_7 += rA1 * rB7; rb7 = *pB7++; rC2_7 += rA2 * rB7; rC3_7 += rA3 * rB7; rC4_7 += rA4 * rB7; rB0 = *pB0++; rC5_7 += rA5 * rB7; rC0_0 += ra0 * rb0; rC1_0 += ra1 * rb0; rA0 = *pA0++; rC2_0 += ra2 * rb0; rC3_0 += ra3 * rb0; rC4_0 += ra4 * rb0; rA1 = *pA1++; rC5_0 += ra5 * rb0; rC0_1 += ra0 * rb1; rC1_1 += ra1 * rb1; rA2 = *pA2++; rC2_1 += ra2 * rb1; rC3_1 += ra3 * rb1; rC4_1 += ra4 * rb1; rA3 = *pA3++; rC5_1 += ra5 * rb1; rC0_2 += ra0 * rb2; rC1_2 += ra1 * rb2; rA4 = *pA4++; rC2_2 += ra2 * rb2; rC3_2 += ra3 * rb2; ATL_pfl1R(pA4+PFD-4); rC4_2 += ra4 * rb2; rA5 = *pA5++; rC5_2 += ra5 * rb2; rC0_3 += ra0 * rb3; rC1_3 += ra1 * rb3; rB1 = *pB1++; rC2_3 += ra2 * rb3; rC3_3 += ra3 * rb3; rC4_3 += ra4 * rb3; rB2 = *pB2++; rC5_3 += ra5 * rb3; rC0_4 += ra0 * rb4; rC1_4 += ra1 * rb4; rB3 = *pB3++; rC2_4 += ra2 * rb4; rC3_4 += ra3 * rb4; rC4_4 += ra4 * rb4; rB4 = *pB4++; rC5_4 += ra5 * rb4; rC0_5 += ra0 * rb5; rC1_5 += ra1 * rb5; rB5 = *pB5++; rC2_5 += ra2 * rb5; rC3_5 += ra3 * rb5; rC4_5 += ra4 * rb5; rB6 = *pB6++; rC5_5 += ra5 * rb5; rC0_6 += ra0 * rb6; rC1_6 += ra1 * rb6; rC2_6 += ra2 * rb6; rB7 = *pB7++; rC3_6 += ra3 * rb6; rC4_6 += ra4 * rb6; rC5_6 += ra5 * rb6; rC0_7 += ra0 * rb7; rC1_7 += ra1 * rb7; rC2_7 += ra2 * rb7; rC3_7 += ra3 * rb7; rC4_7 += ra4 * rb7; rC5_7 += ra5 * rb7; ATL_pfl1R(pA5+PFD-4); rC0_0 += rA0 * rB0; rC1_0 += rA1 * rB0; rC2_0 += rA2 * rB0; rC3_0 += rA3 * rB0; rC4_0 += rA4 * rB0; rb0 = *pB0++; rC5_0 += rA5 * rB0; rC0_1 += rA0 * rB1; rC1_1 += rA1 * rB1; ra0 = *pA0++; rC2_1 += rA2 * rB1; rC3_1 += rA3 * rB1; rC4_1 += rA4 * rB1; ra1 = *pA1++; rC5_1 += rA5 * rB1; rC0_2 += rA0 * rB2;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?