atl_smm6x1x120_sse.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,763 行 · 第 1/3 页

C
1,763
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2004 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_asm.h"#ifdef ATL_GAS_x8664   #define movl movq   #define addl addq   #define subl subq   #define esp  rsp   #define X8664#elif defined(ATL_GAS_x8632)   #define X8632#else   #error "This kernel requires a gas x86 assembler!"#endif#ifndef MB   #ifdef X8664      #define MB 0   #else      #error "MB must be compile-time constant!"   #endif#endif#ifndef NB   #define NB 0#endif#if !defined(KB) || KB != 120   #error "KB must be compile-time constant of 120!"#endif#ifdef ATL_GAS_x8632   #define pC0     %esi   #define pA0     %ecx   #define pA3     %eax   #define pB0     %edi   #define ldab    %edx   #define pfA     %ebp   #define stN     %bh   #define stM     %bl#else   #define pC0     %rsi   #define pA0     %rcx   #define pA3     %rax   #define pB0     %rbx   #define ldab    %rdx   #define pfA     %rbp   #define stM     %rdi   #define stN     %r8   #define MM      %r9   #define MBKB    %r10   #define incCn   %r11   #define rBETA   %xmm8   #define rc0     %xmm9   #define rc1     %xmm10   #define rtmp    %xmm11#endif#define rC0     %xmm0#define rC1     %xmm1#define rC2     %xmm2#define rC3     %xmm3#define rC4     %xmm4#define rC5     %xmm5#define rA0     %xmm6#define rB0     %xmm7#define NB6so   (6*KB*4)/* * Prefetch defines */#if 1   #define pref2(mem) prefetcht1   mem   #define prefB(mem) prefetcht0   mem   #if defined(ATL_ARCH_HAMMER32) || defined(ATL_ARCH_HAMMER64)      #define prefC(mem) prefetchw    mem   #else      #define prefC(mem) prefetcht0    mem   #endif#else   #define pref2(mem)   #define prefB(mem)   #define prefC(mem)#endif#ifdef SCPLX   #define CMUL(arg_) 2*arg_#else   #define CMUL(arg_) arg_#endif/*                      %rdi/4       %rsi/8       %rdx/12          %xmm0/16 void ATL_AUSERMM(const int M, const int N, const int K, const TYPE alpha,                       %rcx/20         %r8/24         %r9/28             32                 const TYPE *A, const int lda, const TYPE *B, const int ldb,                        %xmm1/36    16/40          24/44                 const TYPE beta, TYPE *C, const int ldc)*/        .text.global ATL_asmdecor(ATL_USERMM)ALIGN16ATL_asmdecor(ATL_USERMM):#ifdef X8632        subl    $28, %esp        movl    %ebp, 24(%esp)        movl    %ebx, 20(%esp)        movl    %esi, 16(%esp)        movl    %edi, 12(%esp)/* *      Store incCn = (ldc-MB)*sizeof and BETA to stack */        movl    72(%esp), %eax                                        movb %al, stM        subl    $MB, %eax#ifdef SCPLX        shl     $3, %eax#else        shl     $2, %eax#endif        movl    %eax, 8(%esp)   #ifdef BETAX        flds    64(%esp)        fstps   (%esp)      #define BETAOFF 0   #endif/* *      Initialize pA = A;  pB = B; pC = C; */        movl    68(%esp), pC0                                        prefC((pC0))                                        prefC(64(pC0))        movl    48(%esp), pA0        movl    56(%esp), pB0                                        prefB((pB0))                                        prefB(64(pB0))        addl    $120, pA0        addl    $120, pB0/* *      ldab = K * 4; */        movl    40(%esp), ldab        shl     $2, ldab/* *      pfA = pA + NBNB */        movl    pA0, pfA        addl    $MB*KB*4-120, pfA        movb    36(%esp), stN#else/* *      Save callee-saved iregs */        movq    %rbp, -8(%rsp)        movq    %rbx, -16(%rsp)/* *      Initilize beta */   #ifdef BETAX        unpcklps        %xmm1, %xmm1        movlhps         %xmm1, %xmm1        movaps  %xmm1, rBETA   #endif/* *      On entry, pA0 is correct, ldab has K, stM has M, pC0 has stN; MM has pB0 *      Set ldab = K*4, stN=N, and MBKB = M*K*4 */        shlq    $2, ldab        movq    pC0, stN   #if MB == 0        movq    stM, MBKB        imulq   ldab, MBKB   #else        movq    $MB*KB*4, MBKB   #endif/* *      Initialize incCn = (ldc-MB)*sizeof; pC0 = C; pB0 = B, pfA = pA0+MBKB */        movl    24(%rsp), %rsi        movq    %rsi, incCn        shlq    $32, incCn        sarq    $32, incCn        subq    stM, incCn   #ifdef SCPLX        shlq    $3, incCn   #else        shlq    $2, incCn   #endif        movq    16(%rsp), pC0        movq    MM, pB0        movq    stM, MM        subq    $6, MM        movq    pA0, pfA        addq    MBKB, pfA/* *      Bias ptrs to max 1-byte range */        addq    $120, pA0        addq    $120, pB0/* *      MBKB henceforth used to increment pA0, so subtract NB6so+240 from it */        subq    $NB6so-240, MBKB#endif                                        prefB((pA0))                                        prefB(64(pA0))        ALIGN8NLOOP:	lea	0(pA0, ldab,2), pA3	addl	ldab, pA3#if MB == 0 || MB > 6   #ifdef X8632      #ifdef ATL_DivAns	movb	$ATL_DivAns-1, stM      #else	movb	$MB/6-1, stM      #endif   #else        movq    MM, stM      #if MB == 0        cmp     $0, stM        je      MLOOPCU      #endif   #endif	ALIGN8MLOOP:#ifdef X8664   #if defined(BETAX)      #ifdef SCPLX        movups  (pC0), rc0        movups  16(pC0), rtmp        shufps  $0xE8, rc0, rc0        shufps  $0xE8, rtmp, rtmp        movlhps rtmp, rc0        movss   32(pC0), rc1        movss   40(pC0), rtmp        unpcklps        rtmp, rc1      #else        movups  (pC0), rc0        xorps   rc1, rc1        movlps  16(pC0), rc1      #endif        mulps   rBETA, rc0        mulps   rBETA, rc1   #endif        ALIGN16#elif defined(BETAX)        movss   (pC0), rC0        movss   CMUL(4)(pC0), rC1        movss   CMUL(8)(pC0), rC2        movss   CMUL(12)(pC0), rC3        movss   CMUL(16)(pC0), rC4        movss   CMUL(20)(pC0), rC5        movss   BETAOFF(%esp), rA0        mulss   rA0, rC0        mulss   rA0, rC1        mulss   rA0, rC2        mulss   rA0, rC3        mulss   rA0, rC4        mulss   rA0, rC5        ALIGN16#endif/*KLOOP */#if defined(BETA1)	movaps	0-120(pA0), rC0	movaps	0-120(pB0), rB0	mulps	rB0, rC0        addss   (pC0), rC0	movaps	0-120(pA0,ldab), rC1	mulps	rB0, rC1        addss   CMUL(4)(pC0), rC1	movaps	0-120(pA0,ldab,2), rC2	mulps	rB0, rC2        addss   CMUL(8)(pC0), rC2	movaps	0-120(pA3), rC3	mulps	rB0, rC3        addss   CMUL(12)(pC0), rC3	movaps	0-120(pA3,ldab), rC4	mulps	rB0, rC4        addss   CMUL(16)(pC0), rC4	movaps	0-120(pA3,ldab,2), rC5	mulps	rB0, rC5        addss   CMUL(20)(pC0), rC5#elif defined(BETA0) || defined(X8664)	movaps	0-120(pA0), rC0	movaps	0-120(pB0), rB0	mulps	rB0, rC0	movaps	0-120(pA0,ldab), rC1	mulps	rB0, rC1	movaps	0-120(pA0,ldab,2), rC2	mulps	rB0, rC2	movaps	0-120(pA3), rC3	mulps	rB0, rC3	movaps	0-120(pA3,ldab), rC4	mulps	rB0, rC4	movaps	0-120(pA3,ldab,2), rC5	mulps	rB0, rC5#else	movaps	0-120(pA0), rA0	movaps	0-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	0-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	0-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	0-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	0-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	0-120(pA3,ldab,2), rB0	addps	rB0, rC5#endif	movaps	16-120(pA0), rA0	movaps	16-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	16-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	16-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	16-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	16-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	16-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	32-120(pA0), rA0	movaps	32-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	32-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	32-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	32-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	32-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	32-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	48-120(pA0), rA0	movaps	48-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	48-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	48-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	48-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	48-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	48-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	64-120(pA0), rA0	movaps	64-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	64-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	64-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	64-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	64-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	64-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	80-120(pA0), rA0	movaps	80-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	80-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	80-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	80-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	80-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	80-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	96-120(pA0), rA0	movaps	96-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	96-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	96-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	96-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	96-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	96-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	112-120(pA0), rA0	movaps	112-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	112-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	112-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	112-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	112-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	112-120(pA3,ldab,2), rB0	addps	rB0, rC5                                                pref2((pfA))                                                addl    $25, pfA	movaps	128-120(pA0), rA0	movaps	128-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	128-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	128-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	128-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	128-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	128-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	144-120(pA0), rA0	movaps	144-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	144-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	144-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	144-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	144-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	144-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	160-120(pA0), rA0	movaps	160-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	160-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	160-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	160-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	160-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	160-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	176-120(pA0), rA0	movaps	176-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	176-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	176-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	176-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	176-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	176-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	192-120(pA0), rA0	movaps	192-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	192-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	192-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	192-120(pA3), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	192-120(pA3,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	mulps	192-120(pA3,ldab,2), rB0	addps	rB0, rC5	movaps	208-120(pA0), rA0	movaps	208-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	208-120(pA0,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	208-120(pA0,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	208-120(pA3), rA0	mulps	rB0, rA0

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?