atl_dmm6x1x60pabc.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 3,694 行 · 第 1/5 页
C
3,694 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2002 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_asm.h"#ifdef ATL_GAS_x8632 #define movq movl #define addq addl #define subq subl #define rsp esp#elif !defined(ATL_GAS_x8664) #error "This kernel requires a gas x86 assembler!"#endif#if !defined(NB) || (NB == 0) #error "NB must be a compile-time constant!"#endif#if (NB != 60) #error "NB must be 60!"#endif#if (NB/6)*6 != NB #error "NB must be multiple of 6!"#endif/* * Integer register usage shown be these defines */#ifdef ATL_GAS_x8632 #define pC %esi #define pA %ecx #define pB %edi #define incCn %eax #define stM %edx #define stN %ebx #define pfA %ebp#else #define pC %rsi #define pA %rcx #define pB %rdi #define incCn %rax #define stM %rdx #define stN %rbx #define pfA %rbp /* rax used in 32/64 conversion */#endif#define NBso (NB*8)#define NBNBso (NB*NB*8)#define NB2so (NBso+NBso)#define NB3so (NBso+NBso+NBso)#define NB4so (NBso+NBso+NBso+NBso)#define NB5so (NBso+NBso+NBso+NBso+NBso)#define NB6so (NBso+NBso+NBso+NBso+NBso+NBso)#define NB7so (NB6so+NBso)#define NB8so (NB6so+NB2so)#define NB9so (NB6so+NB3so)#define NB10so (NB6so+NB4so)#define NB11so (NB6so+NB5so)/* * Prefetch defines */#define pref2(mem) prefetcht1 mem#define prefB(mem) prefetcht0 mem#define prefC(mem) prefetchw mem/* *void ATL_AUSERMM(const int M, const int N, const int K, const TYPE alpha, * const TYPE *A, const int lda, const TYPE *B, const int ldb, * const TYPE beta, TYPE *C, const int ldc) */ .text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):#ifdef ATL_GAS_x8632/* * Save callee-saved iregs */ subl $24, %esp movl %ebp, 20(%esp) movl %ebx, 16(%esp) movl %esi, 12(%esp) movl %edi, 8(%esp) #ifdef BETAX fldl 64(%esp) fstpl (%esp) #define BETAOFF 0 #endif/* * Initialize pA = A; pB = B; pC = C; */ movl 48(%esp), pA movl 56(%esp), pB movl 72(%esp), pC prefC((pC)) prefC(64(pC))/* * stM = pA + NBNB-6*NB; pfA = pA+NBNB; stN = pB + NBNB; */ movl $NBNBso-NB6so, stM addl pA, stM movl stM, pfA addl $NB6so, pfA movl $NBNBso, stN addl pB, stN/* * Set incCn = (ldc - NB)*sizeof */ movl 76(%esp), incCn prefB((pB)) prefB(64(pB)) subl $(MB-6), incCn shl $3, incCn#else/* * Save callee-saved iregs */ movq %rbp, -8(%rsp) movq %rbx, -16(%rsp) #ifdef BETAX movsd %xmm1, -24(%rsp) #define BETAOFF -24 #endif/* * pA already comes in right reg * Initialize pB = B; pC = C; NBso = NB * sizeof; */ movq %r9, pB movq 16(%rsp), pC prefC((pC)) prefC(64(pC))/* * stM = pA + NBNBso; stN = pB + NBNBso; */ movq $NBNBso-NB6so, stM addq pA, stM movq stM, pfA addq $NB6so, pfA movq $NBNBso, stN addq pB, stN/* * convert ldc to 64 bits, and then set incCn = (ldc - NB)*sizeof */ movl 24(%rsp), %eax cltq prefB((pB)) prefB(64(pB))/* movq %rax, incCn */ subq $NB, incCn shl $3, incCn addq $48, incCn#endifNLOOP:/* * stK = pB + NBso *//* movq pB, stK *//* addq $NBso, stK */MLOOP:/* *Load C, apply beta. Stack will be: * st(0) temp * st(1) temp * st(2) pC[0] * st(3) pC[1] * st(4) pC[2] * st(5) pC[3] * st(6) pC[4] * st(7) pC[5] */#ifdef BETA0 fldz fldz fldz fldz fldz fldz#else fldl 40(pC) fldl 32(pC) fldl 24(pC) fldl 16(pC) fldl 8(pC) fldl (pC) #ifdef BETAX fldl BETAOFF(%rsp) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmul %st, %st(4) fmul %st, %st(5) fmulp %st, %st(6) #endif#endif ALIGN16/*KLOOP: */ pref2((pfA)) fldl (pB) fldl (pA) fmul %st(1), %st faddp %st, %st(2) fldl NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 8(pB) fldl 8(pA) fmul %st(1), %st faddp %st, %st(2) fldl 8+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 8+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 8+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 8+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 8+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) pref2(64(pfA)) fldl 16(pB) fldl 16(pA) fmul %st(1), %st faddp %st, %st(2) fldl 16+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 16+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 16+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 16+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 16+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 24(pB) fldl 24(pA) fmul %st(1), %st faddp %st, %st(2) fldl 24+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 24+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 24+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 24+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 24+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) pref2(128(pfA)) addq $160, pfA fldl 32(pB) fldl 32(pA) fmul %st(1), %st faddp %st, %st(2) fldl 32+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 32+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 32+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 32+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 32+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 40(pB) fldl 40(pA) fmul %st(1), %st faddp %st, %st(2) fldl 40+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 40+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 40+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 40+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 40+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 48(pB) fldl 48(pA) fmul %st(1), %st faddp %st, %st(2) fldl 48+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 48+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 48+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 48+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 48+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 56(pB) fldl 56(pA) fmul %st(1), %st faddp %st, %st(2) fldl 56+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 56+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 56+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 56+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 56+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 64(pB) fldl 64(pA) fmul %st(1), %st faddp %st, %st(2) fldl 64+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 64+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 64+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 64+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 64+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 72(pB) fldl 72(pA) fmul %st(1), %st faddp %st, %st(2) fldl 72+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 72+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 72+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 72+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 72+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 80(pB) fldl 80(pA) fmul %st(1), %st faddp %st, %st(2) fldl 80+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 80+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 80+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 80+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 80+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 88(pB) fldl 88(pA) fmul %st(1), %st faddp %st, %st(2) fldl 88+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 88+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 88+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 88+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 88+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 96(pB) fldl 96(pA) fmul %st(1), %st faddp %st, %st(2) fldl 96+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 96+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 96+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 96+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 96+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 104(pB) fldl 104(pA) fmul %st(1), %st faddp %st, %st(2) fldl 104+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 104+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 104+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 104+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 104+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 112(pB) fldl 112(pA) fmul %st(1), %st faddp %st, %st(2) fldl 112+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 112+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 112+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 112+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 112+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 120(pB) fldl 120(pA) fmul %st(1), %st faddp %st, %st(2) fldl 120+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 120+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 120+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 120+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 120+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 128(pB) fldl 128(pA) fmul %st(1), %st faddp %st, %st(2) fldl 128+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 128+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 128+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 128+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 128+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 136(pB) fldl 136(pA) fmul %st(1), %st faddp %st, %st(2) fldl 136+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 136+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 136+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 136+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 136+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 144(pB) fldl 144(pA) fmul %st(1), %st faddp %st, %st(2) fldl 144+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 144+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 144+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 144+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 144+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 152(pB) fldl 152(pA) fmul %st(1), %st faddp %st, %st(2) fldl 152+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 152+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 152+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 152+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 152+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 160(pB) fldl 160(pA) fmul %st(1), %st faddp %st, %st(2) fldl 160+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 160+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 160+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 160+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 160+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 168(pB) fldl 168(pA) fmul %st(1), %st faddp %st, %st(2) fldl 168+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 168+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 168+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 168+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 168+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 176(pB) fldl 176(pA) fmul %st(1), %st faddp %st, %st(2) fldl 176+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 176+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 176+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 176+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 176+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 184(pB) fldl 184(pA) fmul %st(1), %st faddp %st, %st(2) fldl 184+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 184+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 184+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 184+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 184+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 192(pB) fldl 192(pA) fmul %st(1), %st faddp %st, %st(2) fldl 192+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 192+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 192+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 192+NB4so(pA) fmul %st(1), %st faddp %st, %st(6) fldl 192+NB5so(pA) fmulp %st, %st(1) faddp %st, %st(6) fldl 200(pB) fldl 200(pA) fmul %st(1), %st faddp %st, %st(2) fldl 200+NBso(pA) fmul %st(1), %st faddp %st, %st(3) fldl 200+NB2so(pA) fmul %st(1), %st faddp %st, %st(4) fldl 200+NB3so(pA) fmul %st(1), %st faddp %st, %st(5) fldl 200+NB4so(pA) fmul %st(1), %st
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?