atl_mm8x8x2.c
来自「基于Blas CLapack的.用过的人知道是干啥的」· C语言 代码 · 共 529 行 · 第 1/2 页
C
529 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2000 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#include "atlas_misc.h"/* #undef ATL_ARCH_IA64Itan */#include "atlas_prefetch.h"void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc){ const int incAm = (lda<<3)-K, incAn = -M*lda; const int incBm = -K, incBn = (ldb<<3); const int Kstart = (K-2)>>1; #ifdef TREAL #define incCm 8 #define ldc2 ldc const int incCn = (ldc<<3) - M; #else #define incCm 16 const int ldc2 = ldc+ldc; const int incCn = (ldc<<4) - M - M; #endif TYPE *pC0=C, *pC1=C+ldc2, *pC2=pC1+ldc2, *pC3=pC2+ldc2, *pC4=pC3+ldc2, *pC5=pC4+ldc2, *pC6=pC5+ldc2, *pC7=pC6+ldc2; const TYPE *pA0=A, *pA1=pA0+lda, *pA2=pA1+lda, *pA3=pA2+lda, *pA4=pA3+lda, *pA5=pA4+lda, *pA6=pA5+lda, *pA7=pA6+lda; const TYPE *pB0=B, *pB1=pB0+ldb, *pB2=pB1+ldb, *pB3=pB2+ldb, *pB4=pB3+ldb, *pB5=pB4+ldb, *pB6=pB5+ldb, *pB7=pB6+ldb; const TYPE *stM = A + lda*M; const TYPE *stN = B + ldb*N; const TYPE *pfA = stM; #ifdef BETAX TYPE *bp = (TYPE *) β #endif register int k; register TYPE rA0, rA1, rA2, rA3, rA4, rA5, rA6, rA7; register TYPE ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; register TYPE rB0, rB1, rB2, rB3, rB4, rB5, rB6, rB7; register TYPE rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC4_0, rC5_0, rC6_0, rC7_0, rC0_1, rC1_1, rC2_1, rC3_1, rC4_1, rC5_1, rC6_1, rC7_1, rC0_2, rC1_2, rC2_2, rC3_2, rC4_2, rC5_2, rC6_2, rC7_2, rC0_3, rC1_3, rC2_3, rC3_3, rC4_3, rC5_3, rC6_3, rC7_3, rC0_4, rC1_4, rC2_4, rC3_4, rC4_4, rC5_4, rC6_4, rC7_4, rC0_5, rC1_5, rC2_5, rC3_5, rC4_5, rC5_5, rC6_5, rC7_5, rC0_6, rC1_6, rC2_6, rC3_6, rC4_6, rC5_6, rC6_6, rC7_6, rC0_7, rC1_7, rC2_7, rC3_7, rC4_7, rC5_7, rC6_7, rC7_7; do { do { #ifdef BETA0 rC0_0 = rC1_0 = rC2_0 = rC3_0 = rC4_0 = rC5_0 = rC6_0 = rC7_0 = rC0_1 = rC1_1 = rC2_1 = rC3_1 = rC4_1 = rC5_1 = rC6_1 = rC7_1 = rC0_2 = rC1_2 = rC2_2 = rC3_2 = rC4_2 = rC5_2 = rC6_2 = rC7_2 = rC0_3 = rC1_3 = rC2_3 = rC3_3 = rC4_3 = rC5_3 = rC6_3 = rC7_3 = rC0_4 = 
rC1_4 = rC2_4 = rC3_4 = rC4_4 = rC5_4 = rC6_4 = rC7_4 = rC0_5 = rC1_5 = rC2_5 = rC3_5 = rC4_5 = rC5_5 = rC6_5 = rC7_5 = rC0_6 = rC1_6 = rC2_6 = rC3_6 = rC4_6 = rC5_6 = rC6_6 = rC7_6 = rC0_7 = rC1_7 = rC2_7 = rC3_7 = rC4_7 = rC5_7 = rC6_7 = rC7_7 = ATL_rzero; #else rC0_0 = *pC0; rC1_0 = pC0[1 SHIFT]; rC2_0 = pC0[2 SHIFT]; rC3_0 = pC0[3 SHIFT]; rC4_0 = pC0[4 SHIFT]; rC5_0 = pC0[5 SHIFT]; rC6_0 = pC0[6 SHIFT]; rC7_0 = pC0[7 SHIFT]; rC0_1 = *pC1; rC1_1 = pC1[1 SHIFT]; rC2_1 = pC1[2 SHIFT]; rC3_1 = pC1[3 SHIFT]; rC4_1 = pC1[4 SHIFT]; rC5_1 = pC1[5 SHIFT]; rC6_1 = pC1[6 SHIFT]; rC7_1 = pC1[7 SHIFT]; rC0_2 = *pC2; rC1_2 = pC2[1 SHIFT]; rC2_2 = pC2[2 SHIFT]; rC3_2 = pC2[3 SHIFT]; rC4_2 = pC2[4 SHIFT]; rC5_2 = pC2[5 SHIFT]; rC6_2 = pC2[6 SHIFT]; rC7_2 = pC2[7 SHIFT]; rC0_3 = *pC3; rC1_3 = pC3[1 SHIFT]; rC2_3 = pC3[2 SHIFT]; rC3_3 = pC3[3 SHIFT]; rC4_3 = pC3[4 SHIFT]; rC5_3 = pC3[5 SHIFT]; rC6_3 = pC3[6 SHIFT]; rC7_3 = pC3[7 SHIFT]; rC0_4 = *pC4; rC1_4 = pC4[1 SHIFT]; rC2_4 = pC4[2 SHIFT]; rC3_4 = pC4[3 SHIFT]; rC4_4 = pC4[4 SHIFT]; rC5_4 = pC4[5 SHIFT]; rC6_4 = pC4[6 SHIFT]; rC7_4 = pC4[7 SHIFT]; rC0_5 = *pC5; rC1_5 = pC5[1 SHIFT]; rC2_5 = pC5[2 SHIFT]; rC3_5 = pC5[3 SHIFT]; rC4_5 = pC5[4 SHIFT]; rC5_5 = pC5[5 SHIFT]; rC6_5 = pC5[6 SHIFT]; rC7_5 = pC5[7 SHIFT]; rC0_6 = *pC6; rC1_6 = pC6[1 SHIFT]; rC2_6 = pC6[2 SHIFT]; rC3_6 = pC6[3 SHIFT]; rC4_6 = pC6[4 SHIFT]; rC5_6 = pC6[5 SHIFT]; rC6_6 = pC6[6 SHIFT]; rC7_6 = pC6[7 SHIFT]; rC0_7 = *pC7; rC1_7 = pC7[1 SHIFT]; rC2_7 = pC7[2 SHIFT]; rC3_7 = pC7[3 SHIFT]; rC4_7 = pC7[4 SHIFT]; rC5_7 = pC7[5 SHIFT]; rC6_7 = pC7[6 SHIFT]; rC7_7 = pC7[7 SHIFT]; #ifdef BETAX rb7 = *bp; rC0_0 *= rb7; rC1_0 *= rb7; rC2_0 *= rb7; rC3_0 *= rb7; rC4_0 *= rb7; rC5_0 *= rb7; rC6_0 *= rb7; rC7_0 *= rb7; rC0_1 *= rb7; rC1_1 *= rb7; rC2_1 *= rb7; rC3_1 *= rb7; rC4_1 *= rb7; rC5_1 *= rb7; rC6_1 *= rb7; rC7_1 *= rb7; rC0_2 *= rb7; rC1_2 *= rb7; rC2_2 *= rb7; rC3_2 *= rb7; rC4_2 *= rb7; rC5_2 *= rb7; rC6_2 *= rb7; rC7_2 *= rb7; rC0_3 *= rb7; rC1_3 *= rb7; 
rC2_3 *= rb7; rC3_3 *= rb7; rC4_3 *= rb7; rC5_3 *= rb7; rC6_3 *= rb7; rC7_3 *= rb7; rC0_4 *= rb7; rC1_4 *= rb7; rC2_4 *= rb7; rC3_4 *= rb7; rC4_4 *= rb7; rC5_4 *= rb7; rC6_4 *= rb7; rC7_4 *= rb7; rC0_5 *= rb7; rC1_5 *= rb7; rC2_5 *= rb7; rC3_5 *= rb7; rC4_5 *= rb7; rC5_5 *= rb7; rC6_5 *= rb7; rC7_5 *= rb7; rC0_6 *= rb7; rC1_6 *= rb7; rC2_6 *= rb7; rC3_6 *= rb7; rC4_6 *= rb7; rC5_6 *= rb7; rC6_6 *= rb7; rC7_6 *= rb7; rC0_7 *= rb7; rC1_7 *= rb7; rC2_7 *= rb7; rC3_7 *= rb7; rC4_7 *= rb7; rC5_7 *= rb7; rC6_7 *= rb7; rC7_7 *= rb7; #endif #endif rA0 = *pA0++; rA1 = *pA1++; rA2 = *pA2++; rA3 = *pA3++; rA4 = *pA4++; rA5 = *pA5++; rA6 = *pA6++; rA7 = *pA7++; rB0 = *pB0++; rB1 = *pB1++; rB2 = *pB2++; rB3 = *pB3++; rB4 = *pB4++; rB5 = *pB5++; rB6 = *pB6++; rB7 = *pB7++; for (k=Kstart; k; k--) /* easy loop to unroll */ { rC0_0 += rA0 * rB0; ra0 = *pA0++; rC1_0 += rA1 * rB0; ra1 = *pA1++; rC2_0 += rA2 * rB0; ra2 = *pA2++; rC3_0 += rA3 * rB0; ra3 = *pA3++; rC4_0 += rA4 * rB0; ra4 = *pA4++; rC5_0 += rA5 * rB0; ra5 = *pA5++; rC6_0 += rA6 * rB0; ra6 = *pA6++; rC7_0 += rA7 * rB0; ra7 = *pA7++; rC0_1 += rA0 * rB1; rb0 = *pB0++; rC1_1 += rA1 * rB1; rb1 = *pB1++; rC2_1 += rA2 * rB1; rb2 = *pB2++; rC3_1 += rA3 * rB1; rb3 = *pB3++; rC4_1 += rA4 * rB1; rb4 = *pB4++; rC5_1 += rA5 * rB1; rb5 = *pB5++; rC6_1 += rA6 * rB1; rb6 = *pB6++; rC7_1 += rA7 * rB1; rb7 = *pB7++; rC0_2 += rA0 * rB2; rC1_2 += rA1 * rB2; rC2_2 += rA2 * rB2; rC3_2 += rA3 * rB2; rB0 = *pB0++; rC4_2 += rA4 * rB2; rB1 = *pB1++; rC5_2 += rA5 * rB2; rC6_2 += rA6 * rB2; rC7_2 += rA7 * rB2; rB2 = *pB2++; rC0_3 += rA0 * rB3; rC1_3 += rA1 * rB3; rC2_3 += rA2 * rB3; rC3_3 += rA3 * rB3; rC4_3 += rA4 * rB3; rC5_3 += rA5 * rB3; rC6_3 += rA6 * rB3; rC7_3 += rA7 * rB3; rB3 = *pB3++; rC0_4 += rA0 * rB4; rC1_4 += rA1 * rB4; rC2_4 += rA2 * rB4; rC3_4 += rA3 * rB4; rC4_4 += rA4 * rB4; rC5_4 += rA5 * rB4; rC6_4 += rA6 * rB4; rC7_4 += rA7 * rB4; rB4 = *pB4++; rC0_5 += rA0 * rB5; rC1_5 += rA1 * rB5; rC2_5 += rA2 * rB5; rC3_5 += rA3 * rB5; 
rC4_5 += rA4 * rB5; rC5_5 += rA5 * rB5; rC6_5 += rA6 * rB5; rC7_5 += rA7 * rB5; rB5 = *pB5++; rC0_6 += rA0 * rB6; rC1_6 += rA1 * rB6; rC2_6 += rA2 * rB6; rC3_6 += rA3 * rB6; rC4_6 += rA4 * rB6; rC5_6 += rA5 * rB6; rC6_6 += rA6 * rB6; rC7_6 += rA7 * rB6; rB6 = *pB6++; rC0_7 += rA0 * rB7; rC1_7 += rA1 * rB7; rC2_7 += rA2 * rB7; rC3_7 += rA3 * rB7; rC4_7 += rA4 * rB7; rC5_7 += rA5 * rB7; rC6_7 += rA6 * rB7; rC7_7 += rA7 * rB7; rB7 = *pB7++; rC0_0 += ra0 * rb0; rC1_0 += ra1 * rb0; rA0 = *pA0++; rC2_0 += ra2 * rb0; rA1 = *pA1++; rC3_0 += ra3 * rb0; rA2 = *pA2++; rC4_0 += ra4 * rb0; rA3 = *pA3++; rC5_0 += ra5 * rb0; rA4 = *pA4++; rC6_0 += ra6 * rb0; rA5 = *pA5++; rC7_0 += ra7 * rb0; rA6 = *pA6++; rC0_1 += ra0 * rb1; rA7 = *pA7++; rC1_1 += ra1 * rb1; rC2_1 += ra2 * rb1; rC3_1 += ra3 * rb1; rC4_1 += ra4 * rb1; rC5_1 += ra5 * rb1; rC6_1 += ra6 * rb1; rC7_1 += ra7 * rb1; rC0_2 += ra0 * rb2; rC1_2 += ra1 * rb2;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?