📄 atl_smm6x1x60_x87.c
字号:
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2003 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_asm.h"/* * The basic outline of this file came from the x87 kernel I wrote for the * Hammer processor. However, the key to good athlon performance comes from * instruction alignment, and I got this key from Julian Ruhe's explanation * of athlon optimization tips. 
*/#ifndef ATL_GAS_x8632 #error "This kernel requires a gas x86 assembler!"#endif#define BOFF 120#if !defined(MB) || (MB == 0) #error "MB must be a compile-time constant!"#endif#if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!"#endif#if (KB > 60) #error "KB must less than 61!"#endif#if (MB/6)*6 != MB #error "MB must be multiple of 6!"#endif/* * Integer register usage shown by these defines */#ifdef ATL_GAS_x8632 #define pC0 %esi #define pA0 %ecx #define pA1 %eax #define pB0 %edi #define ldab %edx #define pfA %ebp #define stN %bh/* #define stM %bl *//* lower 16 bits of %ebx used for M & N loop counters *//* incCn overwrites pA1 */#endif/* * Prefetch defines */#if defined(ATL_SSE1) || defined(ATL_SSE2) #define pref2(mem) prefetcht1 mem #define prefB(mem) prefetcht0 mem #ifdef ATL_3DNow #define prefC(mem) prefetchw mem #else #define prefC(mem) prefetchnta mem #endif#elif defined(ATL_3DNow) #define pref2(mem) prefetch mem #define prefB(mem) prefetch mem #define prefC(mem) prefetchw mem#else #define pref2(mem) #define prefB(mem) #define prefC(mem)#endif#ifdef SCPLX #define CMUL(arg_) (2*(arg_))#else #define CMUL(arg_) arg_#endif/* * Byte offsets of the arguments relative to %esp on entry (before the 28-byte frame below is allocated): * 4 M, 8 N, 12 K, 16 alpha, 20 A, 24 lda, 28 B, 32 ldb, 36 beta, 40 C, 44 ldc * void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc) */.text.global ATL_asmdecor(ATL_USERMM)ALIGN16ATL_asmdecor(ATL_USERMM): subl $28, %esp movl %ebp, 24(%esp) movl %ebx, 20(%esp) movl %esi, 16(%esp) movl %edi, 12(%esp)/* * Store incCn = (ldc-(MB-6))*sizeof to 8(%esp) and, when BETAX is defined, BETA to (%esp) */ movl 72(%esp), %eax subl $MB-6, %eax#ifdef SCPLX shl $3, %eax#else shl $2, %eax#endif movl %eax, 8(%esp) #ifdef BETAX movl 64(%esp), %eax movl %eax, (%esp) #define BETAOFF 0 #endif/* * Initialize pA = A; pB = B; pC = C; pA and pB are biased by BOFF bytes */ movl 68(%esp), pC0 prefC((pC0)) prefC(64(pC0)) movl 48(%esp), pA0 movl 56(%esp), pB0 addl $BOFF, pA0 addl $BOFF, pB0/* * ldab = KB * 4; the K*4 value computed first from the stack argument is immediately overwritten by the compile-time constant */ movl 40(%esp), ldab shl $2, ldab movl $KB*4, 
ldab/* * pfA = pA + NBNB */ movl pA0, pfA addl $MB*KB*4, pfA prefB((pB0)) prefB(64(pB0)) movb 36(%esp), stN lea 0(pA0, ldab), pA1 ALIGN16NLOOP:/* movb $MB/6-1, stM */#if (MB > 6) ALIGN16/*MLOOP: *//* *Load C, apply beta. Stack will be: * st(0) temp * st(1) temp * st(2) pC[0] * st(3) pC[1] * st(4) pC[2] * st(5) pC[3] * st(6) pC[4] * st(7) pC[5] *//*KLOOP: */#ifdef BETA0 flds 0-BOFF(pB0) flds 0-BOFF(pA1,ldab,4) fmul %st(1), %st fxch flds 0-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 fxch flds 0-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 fxch flds 0-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 fxch flds 0-BOFF(pA0,ldab) fmul %st(1), %st ALIGN8 fxch fmuls 0-BOFF(pA0) ALIGN8#elif defined(BETA1) flds 0-BOFF(pB0) flds 0-BOFF(pA0,ldab,4) fmul %st(1), %st fadds CMUL(16)(pC0) flds 0-BOFF(pA1,ldab,2) fmul %st(2), %st fadds CMUL(12)(pC0) flds 0-BOFF(pA0,ldab,2) fmul %st(3), %st fadds CMUL(8)(pC0) flds 0-BOFF(pA0,ldab) fmul %st(4), %st fadds CMUL(4)(pC0) flds 0-BOFF(pA0) fmul %st(5), %st fadds 0(pC0) flds 0-BOFF(pA1,ldab,4) fmul %st(6), %st fadds CMUL(20)(pC0) fstp %st(6)#else flds BETAOFF(%esp) flds CMUL(16)(pC0) fmul %st(1), %st flds CMUL(12)(pC0) fmul %st(2), %st flds CMUL(8)(pC0) fmul %st(3), %st flds CMUL(4)(pC0) fmul %st(4), %st ALIGN8 flds 0(pC0) fmul %st(5), %st flds CMUL(20)(pC0) fmul %st(6), %st fxch %st(6) fstp %st ALIGN8 flds 0-BOFF(pB0) flds 0-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 0-BOFF(pA0,ldab) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 0-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 0-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 0-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 0-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 1) flds 4-BOFF(pB0) flds 4-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 4-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 4-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 4-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 
4-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 4-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 2) flds 8-BOFF(pB0) flds 8-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 8-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 8-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 8-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 8-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 8-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 3) flds 12-BOFF(pB0) flds 12-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 12-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 12-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 12-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 12-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 12-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 4) flds 16-BOFF(pB0) flds 16-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 16-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 16-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 16-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 16-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 16-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 5) flds 20-BOFF(pB0) flds 20-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 20-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 20-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 20-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 20-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 20-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 6) flds 24-BOFF(pB0) flds 24-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 24-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 24-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 24-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 
24-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 24-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 7) flds 28-BOFF(pB0) flds 28-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 28-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 28-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 28-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 28-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 28-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 8) flds 32-BOFF(pB0) flds 32-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 32-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 32-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 32-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 32-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 32-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 9) flds 36-BOFF(pB0) flds 36-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 36-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 36-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 36-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 36-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 36-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 10) flds 40-BOFF(pB0) flds 40-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 40-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 40-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 40-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 40-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 40-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 11) flds 44-BOFF(pB0) flds 44-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 44-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 44-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 44-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, 
%st(5) flds 44-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 44-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 12) flds 48-BOFF(pB0) flds 48-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 48-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 48-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 48-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 48-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 48-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 13) flds 52-BOFF(pB0) flds 52-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 52-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 52-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 52-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 52-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 52-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 14) flds 56-BOFF(pB0) flds 56-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 56-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 56-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 56-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 56-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 56-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 15) flds 60-BOFF(pB0) flds 60-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 60-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 60-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 60-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 60-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 60-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 16) flds 64-BOFF(pB0) flds 64-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 64-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 64-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 64-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 
faddp %st, %st(5) flds 64-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 64-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 17) flds 68-BOFF(pB0) flds 68-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 68-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 68-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 68-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 68-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 68-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 18) flds 72-BOFF(pB0) flds 72-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 72-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 72-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 72-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 72-BOFF(pA0,ldab,4) fmul %st(1), %st ALIGN8 faddp %st, %st(6) fmuls 72-BOFF(pA1,ldab,4) faddp %st, %st(6) ALIGN8#endif#if (KB > 19) flds 76-BOFF(pB0) flds 76-BOFF(pA0) fmul %st(1), %st faddp %st, %st(2) flds 76-BOFF(pA1) fmul %st(1), %st ALIGN8 faddp %st, %st(3) flds 76-BOFF(pA0,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(4) flds 76-BOFF(pA1,ldab,2) fmul %st(1), %st ALIGN8 faddp %st, %st(5) flds 76-BOFF(pA0,ldab,4)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -