⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 atl_smm6x1x60_x87.c

📁 基于 BLAS / CLAPACK 的代码，用过的人知道是干啥的
💻 C
📖 第 1 页 / 共 5 页
字号:
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2003 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_asm.h"/* * The basic outline of this file came from the x87 kernel I wrote for the * Hammer processor.  However, the key to good athlon performance comes from * instruction alignment, and I got this key from Julian Ruhe's explanation * of athlon optimization tips. 
*/#ifndef ATL_GAS_x8632   #error "This kernel requires a gas x86 assembler!"#endif#define BOFF 120#if !defined(MB) || (MB == 0)   #error "MB must be a compile-time constant!"#endif#if !defined(KB) || (KB == 0)   #error "KB must be a compile-time constant!"#endif#if (KB > 60)   #error "KB must less than 61!"#endif#if (MB/6)*6 != MB   #error "MB must be multiple of 6!"#endif/* * Integer register usage shown be these defines */#ifdef ATL_GAS_x8632   #define pC0     %esi   #define pA0     %ecx   #define pA1     %eax   #define pB0     %edi   #define ldab    %edx   #define pfA     %ebp   #define stN     %bh/*   #define stM     %bl *//* lower 16 bits of %ebx used for M & N loop counters *//* incCn overwrites pA1 */#endif/* * Prefetch defines */#if defined(ATL_SSE1) || defined(ATL_SSE2)   #define pref2(mem) prefetcht1   mem   #define prefB(mem) prefetcht0   mem   #ifdef ATL_3DNow      #define prefC(mem) prefetchw  mem   #else      #define prefC(mem) prefetchnta  mem   #endif#elif defined(ATL_3DNow)   #define pref2(mem) prefetch   mem   #define prefB(mem) prefetch   mem   #define prefC(mem) prefetchw  mem#else   #define pref2(mem)   #define prefB(mem)   #define prefC(mem)#endif#ifdef SCPLX   #define CMUL(arg_) (2*(arg_))#else   #define CMUL(arg_) arg_#endif/*                           4            8           12                16 void ATL_AUSERMM(const int M, const int N, const int K, const TYPE alpha,                            20             24             28             32                 const TYPE *A, const int lda, const TYPE *B, const int ldb,                              36       40             44                 const TYPE beta, TYPE *C, const int ldc)*/.text.global ATL_asmdecor(ATL_USERMM)ALIGN16ATL_asmdecor(ATL_USERMM):        subl    $28, %esp        movl    %ebp, 24(%esp)        movl    %ebx, 20(%esp)        movl    %esi, 16(%esp)        movl    %edi, 12(%esp)/* *      Store incCn = (ldc-NB)*sizeof and BETA to stack */        movl    72(%esp), %eax        subl  
  $MB-6, %eax#ifdef SCPLX        shl     $3, %eax#else        shl     $2, %eax#endif        movl    %eax, 8(%esp)   #ifdef BETAX        movl    64(%esp), %eax        movl    %eax, (%esp)      #define BETAOFF 0   #endif/* *      Initialize pA = A;  pB = B; pC = C; */        movl    68(%esp), pC0                                        prefC((pC0))                                        prefC(64(pC0))        movl    48(%esp), pA0        movl    56(%esp), pB0        addl    $BOFF, pA0        addl    $BOFF, pB0/* *      ldab = K * 8; */        movl    40(%esp), ldab        shl     $2, ldab        movl    $KB*4, ldab/* *      pfA = pA + NBNB */        movl    pA0, pfA        addl    $MB*KB*4, pfA                                        prefB((pB0))                                        prefB(64(pB0))        movb    36(%esp), stN        lea     0(pA0, ldab), pA1        ALIGN16NLOOP:/*        movb    $MB/6-1, stM */#if (MB > 6)        ALIGN16/*MLOOP: *//* *Load C, apply beta.  Stack will be: * st(0)  temp * st(1)  temp * st(2)  pC[0] * st(3)  pC[1] * st(4)  pC[2] * st(5)  pC[3] * st(6)  pC[4] * st(7)  pC[5] *//*KLOOP: */#ifdef BETA0	flds	0-BOFF(pB0)	flds    0-BOFF(pA1,ldab,4)	fmul	%st(1), %st        fxch	flds	0-BOFF(pA0,ldab,4)	fmul	%st(1), %st        ALIGN8        fxch	flds	0-BOFF(pA1,ldab,2)	fmul	%st(1), %st        ALIGN8        fxch	flds	0-BOFF(pA0,ldab,2)	fmul	%st(1), %st        ALIGN8        fxch	flds	0-BOFF(pA0,ldab)	fmul	%st(1), %st        ALIGN8        fxch	fmuls	0-BOFF(pA0)        ALIGN8#elif defined(BETA1)	flds	0-BOFF(pB0)	flds	0-BOFF(pA0,ldab,4)	fmul	%st(1), %st        fadds   CMUL(16)(pC0)	flds	0-BOFF(pA1,ldab,2)	fmul	%st(2), %st        fadds   CMUL(12)(pC0)	flds	0-BOFF(pA0,ldab,2)	fmul	%st(3), %st        fadds   CMUL(8)(pC0)	flds	0-BOFF(pA0,ldab)	fmul	%st(4), %st        fadds   CMUL(4)(pC0)	flds	0-BOFF(pA0)	fmul	%st(5), %st        fadds   0(pC0)	flds    0-BOFF(pA1,ldab,4)	fmul	%st(6), %st        fadds   CMUL(20)(pC0)        fstp    %st(6)#else        flds    
BETAOFF(%esp)        flds    CMUL(16)(pC0)        fmul    %st(1), %st        flds    CMUL(12)(pC0)        fmul    %st(2), %st        flds    CMUL(8)(pC0)        fmul    %st(3), %st        flds    CMUL(4)(pC0)        fmul    %st(4), %st        ALIGN8        flds    0(pC0)        fmul    %st(5), %st        flds    CMUL(20)(pC0)        fmul    %st(6), %st        fxch    %st(6)        fstp    %st        ALIGN8	flds	0-BOFF(pB0)	flds	0-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	0-BOFF(pA0,ldab)	fmul	%st(1), %st        ALIGN8	faddp	%st, %st(3)	flds	0-BOFF(pA0,ldab,2)	fmul	%st(1), %st        ALIGN8	faddp	%st, %st(4)	flds	0-BOFF(pA1,ldab,2)	fmul	%st(1), %st        ALIGN8	faddp	%st, %st(5)	flds	0-BOFF(pA0,ldab,4)	fmul	%st(1), %st        ALIGN8	faddp	%st, %st(6)	fmuls   0-BOFF(pA1,ldab,4)	faddp	%st, %st(6)        ALIGN8#endif#if (KB > 1)	flds	4-BOFF(pB0)	flds	4-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	4-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	4-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	4-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	4-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	4-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 2)	flds	8-BOFF(pB0)	flds	8-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	8-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	8-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	8-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	8-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	8-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 3)	flds	12-BOFF(pB0)	flds	12-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	12-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	12-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	12-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	12-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	12-BOFF(pA1,ldab,4)	faddp	
%st, %st(6)	ALIGN8#endif#if (KB > 4)	flds	16-BOFF(pB0)	flds	16-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	16-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	16-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	16-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	16-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	16-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 5)	flds	20-BOFF(pB0)	flds	20-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	20-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	20-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	20-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	20-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	20-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 6)	flds	24-BOFF(pB0)	flds	24-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	24-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	24-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	24-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	24-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	24-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 7)	flds	28-BOFF(pB0)	flds	28-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	28-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	28-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	28-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	28-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	28-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 8)	flds	32-BOFF(pB0)	flds	32-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	32-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	32-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	32-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	32-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	32-BOFF(pA1,ldab,4)	
faddp	%st, %st(6)	ALIGN8#endif#if (KB > 9)	flds	36-BOFF(pB0)	flds	36-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	36-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	36-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	36-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	36-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	36-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 10)	flds	40-BOFF(pB0)	flds	40-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	40-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	40-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	40-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	40-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	40-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 11)	flds	44-BOFF(pB0)	flds	44-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	44-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	44-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	44-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	44-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	44-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 12)	flds	48-BOFF(pB0)	flds	48-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	48-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	48-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	48-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	48-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	48-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 13)	flds	52-BOFF(pB0)	flds	52-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	52-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	52-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	52-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	52-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	
52-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 14)	flds	56-BOFF(pB0)	flds	56-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	56-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	56-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	56-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	56-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	56-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 15)	flds	60-BOFF(pB0)	flds	60-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	60-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	60-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	60-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	60-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	60-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 16)	flds	64-BOFF(pB0)	flds	64-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	64-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	64-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	64-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	64-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	64-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 17)	flds	68-BOFF(pB0)	flds	68-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	68-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	68-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	68-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	68-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(6)	fmuls	68-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 18)	flds	72-BOFF(pB0)	flds	72-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	72-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	72-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	72-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	72-BOFF(pA0,ldab,4)	fmul	%st(1), %st	ALIGN8	faddp	%st, 
%st(6)	fmuls	72-BOFF(pA1,ldab,4)	faddp	%st, %st(6)	ALIGN8#endif#if (KB > 19)	flds	76-BOFF(pB0)	flds	76-BOFF(pA0)	fmul	%st(1), %st	faddp	%st, %st(2)	flds	76-BOFF(pA1)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(3)	flds	76-BOFF(pA0,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(4)	flds	76-BOFF(pA1,ldab,2)	fmul	%st(1), %st	ALIGN8	faddp	%st, %st(5)	flds	76-BOFF(pA0,ldab,4)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -