atl_dmm4x1x44_4_sse2.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 672 行

C
672
字号
/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2004 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include "atlas_asm.h"
/*
 * Efficeon-optimized 4x1x44 DGEMM.  Pipelined to 4 (4 accumulators).
 * Prefetches the next col of B, and a col from the next block of A in the
 * M-loop.  Purposely kept small so it is retained in cache, and easy to
 * translate when it is not.
 *
 * Syntax : GNU as, AT&T operand order, run through the C preprocessor.
 * Target : 32-bit x86 with SSE2; all arguments are read from the stack.
 *
 * Each M-loop iteration forms four sums of KB=44 products: the same 44
 * values at pB are multiplied with four runs of A that start NBso bytes
 * apart, and the four results are written to consecutive elements of C
 * (8 bytes apart, or 16 bytes apart when DCPLX is defined).
 *
 * Build-time switches read by this file:
 *   KB      must be 44
 *   MB, NB  0 (or undefined) means "take M / N from the argument list"
 *   BETA1   C is added to the product
 *   BETAX   C is scaled by the beta argument and added to the product
 *   (none)  C is overwritten with the product
 *   DCPLX   C elements are 16 bytes apart instead of 8
 *
 * NOTE(review): the loop counters live in the byte registers bl and bh and
 * the M-loop counts down by 4, so the code relies on M being a non-zero
 * multiple of 4 and on M and N fitting in 8 bits -- confirm that the
 * callers guarantee this.
 * NOTE(review): A and B are read with movapd, which needs 16-byte aligned
 * addresses.
 */
#ifndef ATL_GAS_x8632
   #error "This kernel requires gas x86-32 assembler!"
#endif
/*
 * Test for a missing/zero KB first: an undefined KB evaluates to 0 in #if,
 * so testing KB != 44 first would hide the more specific diagnostic.
 */
#if !defined(KB) || (KB == 0)
   #error "KB must be a compile-time constant!"
#elif KB != 44
   #error "KB must be 44!"
#endif
#if !defined(NB)
   #define NB 0
#endif
#if !defined(MB)
   #define MB 0
#endif
#if (MB/4)*4 != MB
   #error "MB must be multiple of 4!"
#endif
/*
 * CMUL scales a byte offset into C: doubled when DCPLX is defined.
 */
#ifdef DCPLX
   #define OFF 16
   #define CMUL(i_) (2*(i_))
#else
   #define OFF 8
   #define CMUL(i_) i_
#endif
/*
 * Integer register usage shown by these defines
 */
#define pC      %esi    /* current output position in C               */
#define pA      %ebp    /* current position in A, biased by +120      */
#define pB      %edi    /* current column of B, biased by +120        */
#define incCn   %eax    /* bytes to add to pC after each column       */
#define stM     %bl     /* M-loop down-counter (rows left)            */
#define stN     %bh     /* N-loop down-counter (columns left)         */
#define pfB     %edx    /* prefetch pointer into B                    */
#define pfA     %ecx    /* prefetch pointer into A                    */
#define pA0     pA
#define pB0     pB
/*
 * SSE register usage: m0-m3 hold operands/products, rC0-rC3 accumulate
 */
#define m0      %xmm0
#define m1      %xmm1
#define m2      %xmm2
#define m3      %xmm3
#define rC0     %xmm4
#define rC1     %xmm5
#define rC2     %xmm6
#define rC3     %xmm7
/*
 * NBnso = n * (KB*8): byte offset of the n-th KB-long run of doubles
 */
#define NB0so   0
#define NBso    (KB*8)
#define NB1so   (KB*8)
#define NB2so   (NBso+NBso)
#define NB3so   (NBso+NBso+NBso)
#define NB4so   (NBso+NBso+NBso+NBso)
#define NB5so   (NBso+NBso+NBso+NBso+NBso)
#define NB6so   (NBso+NBso+NBso+NBso+NBso+NBso)
#define NB7so   (NB6so+NBso)
#define NB8so   (NB6so+NB2so)
#define NB9so   (NB6so+NB3so)
#define NB10so   (NB6so+NB4so)
#define NB11so   (NB6so+NB5so)
#if MB != 0
   #define MBKBso  (MB*KB*8)
#endif
/*
 * Prefetch defines
 */
#if 1
   #define pref2(mem) prefetcht0 mem
   #define prefB(mem) prefetcht0 mem
   #define prefC(mem) prefetcht0 mem
#else
   #define pref2(mem)
   #define prefB(mem)
   #define prefC(mem)
#endif
/*
 *offset                    4            8           12                16
 *void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
 *offset                     24             28             32            36
 *                const TYPE *A, const int lda, const TYPE *B, const int ldb,
 *offset                       40       48             52
 *                const TYPE beta, TYPE *C, const int ldc)
 *
 * Offsets are relative to %esp at entry.  K, alpha, lda and ldb are never
 * read by this kernel.
 */
        .text
.global ATL_asmdecor(ATL_USERMM)
ATL_asmdecor(ATL_USERMM):
/*
 *      Save callee-saved iregs; Save old stack pointer in eax,
 *      so we can adjust for BETA alignment
 *
 *      BETAX frame (esp rounded down to a 16-byte boundary):
 *         0(%esp)  beta duplicated into both halves (read with movapd)
 *        16(%esp)  esp at entry
 *        20..32    saved edi, esi, ebx, ebp
 *        36        COFF  (only referenced by commented-out code)
 *        40        MOFF  = M          (written only when MB == 0)
 *        44        MKOFF = M*NBso     (written only when MB == 0)
 *
 *      non-BETAX frame (FSIZE bytes):
 *         0..12    saved edi, esi, ebx, ebp
 *        16        COFF  (only referenced by commented-out code)
 *        24        MKOFF = M*NBso     (written only when MB == 0)
 *        FSIZE+4   MOFF  = the M argument itself, in the caller frame
 */
#ifdef BETAX
        movl    %esp, %eax
        subl    $48, %esp
        andl    $0xFFFFFFF0, %esp
   #define BETAOFF
   #define COFF 36
   #define MOFF 40
   #define MKOFF 44
        movl    %ebp, 32(%esp)
        movl    %ebx, 28(%esp)
        movl    %esi, 24(%esp)
        movl    %edi, 20(%esp)
        movl    %eax, 16(%esp)
        movlpd  40(%eax), rC0                   /* low half = beta          */
        unpcklpd rC0, rC0                       /* both halves = beta       */
        movapd  rC0, BETAOFF(%esp)              /* BETAOFF is empty: (%esp) */
#else
   #define FSIZE 28
   #define BETAOFF FSIZE+40
   #define COFF 16
   #define MOFF FSIZE+4
   #define MKOFF COFF+8
        subl    $FSIZE, %esp
        movl    %ebp, 12(%esp)
        movl    %ebx,  8(%esp)
        movl    %esi,  4(%esp)
        movl    %edi,   (%esp)
#endif
/*
 *      Runtime M: remember M and M*NBso (bytes of A consumed per column)
 */
#if MB == 0
   #ifdef BETAX
        movl    4(%eax), %ebx
        movl    %ebx, MOFF(%esp)
   #else
        movl    MOFF(%esp), %ebx
   #endif
        imul    $NBso, %ebx
        movl    %ebx, MKOFF(%esp)
#endif
/*
 *      Initialize pA = A;  pB = B; pC = C;
 *      In the BETAX case incCn (eax) is loaded last because eax is the
 *      base register for every argument read.
 */
#ifdef BETAX
        movl    24(%eax), pA
        movl    32(%eax), pB
        movl    48(%eax), pC
   #if NB == 0
        movb    8(%eax), stN
   #else
        movb    $NB, stN
   #endif
        movl    52(%eax), incCn
#else
        movl    FSIZE+24(%esp), pA
        movl    FSIZE+32(%esp), pB
        movl    FSIZE+48(%esp), pC
   #if NB == 0
        movb    FSIZE+8(%esp), stN
   #else
        movb    $NB, stN
   #endif
        movl    FSIZE+52(%esp), incCn
#endif
/*
 *      Set incCn = (ldc - MB)*sizeof
 */
   #if MB == 0
        subl    MOFF(%esp), incCn
   #else
        subl    $MB, incCn
   #endif
   #ifdef DCPLX
        shl     $4, incCn
   #else
        shl     $3, incCn
   #endif
/*      movl    incCn, COFF(%esp) */
/*
 *      pfA = A + M*NBso: start prefetching just past this block of A
 */
        movl    pA0, pfA
#if MB == 0
        addl    MKOFF(%esp), pfA
#else
        addl    $MBKBso, pfA
#endif
/*
 *      Bias both pointers by +120; every displacement below subtracts the
 *      bias again.  NOTE(review): presumably done so that the B
 *      displacements fit in a signed byte -- confirm.
 */
        addl    $120, pA0
        addl    $120, pB0
NLOOP:
#if MB == 0
        movb    MOFF(%esp), stM
#else
        movb     $MB, stM
#endif
        lea     120+NBso(pB0), pfB
#if MB != -5
MLOOP:
/*
 *      Unrolled & pipelined K-loop: 22 packed steps of 2 doubles each.
 *      Every step loads one pair from B, multiplies it with the pairs at
 *      NB0so/NB1so/NB2so/NB3so in A and adds the products into rC0..rC3.
 *      The roles of m0..m3 rotate by one register per step.
 */
#ifdef BETA1
/*
 *      BETA1: seed the low half of rC0..rC2 with C (movsd from memory
 *      clears the high half); C[3] is added to rC3 after the K-loop.
 */
   #ifdef DCPLX
        movsd   (pC), rC0
        movsd   16(pC), rC1
        movapd  0-120(pB0), rC3
        movapd  0-120(pA0), m0
        movapd  NBso+0-120(pA0), m1
        movapd  NB2so+0-120(pA0), m2
        mulpd   rC3, m0
        mulpd   rC3, m1
        mulpd   rC3, m2
        movsd   32(pC), rC2
   #else
        movsd   (pC), rC0
        movsd   16(pC), rC2
        movapd  0-120(pB0), rC3
        movapd  0-120(pA0), m0
        movapd  NBso+0-120(pA0), m1
        movapd  NB2so+0-120(pA0), m2
        mulpd   rC3, m0
        mulpd   rC3, m1
        mulpd   rC3, m2
        movsd   8(pC), rC1
   #endif
        mulpd   NB3so+0-120(pA0), rC3
        addpd   m0, rC0
        movapd  16-120(pB0), m0
        addpd   m1, rC1
        movapd  NB0so+16-120(pA0), m1
        mulpd   m0, m1
        addpd   m2, rC2
        movapd  NB1so+16-120(pA0), m2
        mulpd   m0, m2
        movapd  NB2so+16-120(pA0), m3
        mulpd   m0, m3
#else
/*
 *      No C seed: the first products are formed directly in rC0..rC3.
 *      NOTE(review): ALIGN4 is not defined in this file; presumably an
 *      alignment macro from atlas_asm.h -- confirm.
 */
        nop
        nop
        nop
        nop
        movapd  0-120(pB0), rC3
        movapd  16-120(pB0), m0
        ALIGN4
        movapd  0-120(pA0), rC0
        movapd  NBso+0-120(pA0), rC1
        movapd  NB2so+0-120(pA0), rC2
        mulpd   rC3, rC0
        mulpd   rC3, rC1
        mulpd   rC3, rC2
        mulpd   NB3so+0-120(pA0), rC3
        movapd  NB0so+16-120(pA0), m1
        mulpd   m0, m1
        movapd  NB1so+16-120(pA0), m2
        mulpd   m0, m2
        movapd  NB2so+16-120(pA0), m3
        mulpd   m0, m3
#endif
        mulpd   NB3so+16-120(pA0), m0
/*      byte offset 32 */
        addpd   m1, rC0
        movapd  32-120(pB0), m1
        addpd   m2, rC1
        movapd  NB0so+32-120(pA0), m2
        mulpd   m1, m2
        addpd   m3, rC2
        movapd  NB1so+32-120(pA0), m3
        mulpd   m1, m3
        addpd   m0, rC3
        movapd  NB2so+32-120(pA0), m0
        mulpd   m1, m0
   #ifdef BETA100
        addsd   CMUL(16)(pC), rC2
   #endif
        mulpd   NB3so+32-120(pA0), m1
/*      byte offset 48 */
        addpd   m2, rC0
        movapd  48-120(pB0), m2
        addpd   m3, rC1
        movapd  NB0so+48-120(pA0), m3
        mulpd   m2, m3
        addpd   m0, rC2
        movapd  NB1so+48-120(pA0), m0
        mulpd   m2, m0
        addpd   m1, rC3
        movapd  NB2so+48-120(pA0), m1
        mulpd   m2, m1
        mulpd   NB3so+48-120(pA0), m2
/*      byte offset 64 */
        addpd   m3, rC0
        movapd  64-120(pB0), m3
        addpd   m0, rC1
        movapd  NB0so+64-120(pA0), m0
        mulpd   m3, m0
        addpd   m1, rC2
        movapd  NB1so+64-120(pA0), m1
        mulpd   m3, m1
        addpd   m2, rC3
        movapd  NB2so+64-120(pA0), m2
        mulpd   m3, m2
        mulpd   NB3so+64-120(pA0), m3
/*      byte offset 80 */
        addpd   m0, rC0
        movapd  80-120(pB0), m0
        addpd   m1, rC1
        movapd  NB0so+80-120(pA0), m1
        mulpd   m0, m1
        addpd   m2, rC2
        movapd  NB1so+80-120(pA0), m2
        mulpd   m0, m2
        addpd   m3, rC3
        movapd  NB2so+80-120(pA0), m3
        mulpd   m0, m3
        mulpd   NB3so+80-120(pA0), m0
/*      byte offset 96 */
        addpd   m1, rC0
        movapd  96-120(pB0), m1
        addpd   m2, rC1
        movapd  NB0so+96-120(pA0), m2
        mulpd   m1, m2
        addpd   m3, rC2
        movapd  NB1so+96-120(pA0), m3
        mulpd   m1, m3
        addpd   m0, rC3
        movapd  NB2so+96-120(pA0), m0
        mulpd   m1, m0
        mulpd   NB3so+96-120(pA0), m1
/*      byte offset 112 */
        addpd   m2, rC0
        movapd  112-120(pB0), m2
        addpd   m3, rC1
        movapd  NB0so+112-120(pA0), m3
        mulpd   m2, m3
        addpd   m0, rC2
        movapd  NB1so+112-120(pA0), m0
        mulpd   m2, m0
        addpd   m1, rC3
        movapd  NB2so+112-120(pA0), m1
        mulpd   m2, m1
        mulpd   NB3so+112-120(pA0), m2
/*      byte offset 128 */
        addpd   m3, rC0
        movapd  128-120(pB0), m3
        addpd   m0, rC1
        movapd  NB0so+128-120(pA0), m0
        mulpd   m3, m0
        addpd   m1, rC2
        movapd  NB1so+128-120(pA0), m1
        mulpd   m3, m1
        addpd   m2, rC3
        movapd  NB2so+128-120(pA0), m2
        mulpd   m3, m2
        mulpd   NB3so+128-120(pA0), m3
/*      byte offset 144 */
        addpd   m0, rC0
        movapd  144-120(pB0), m0
        addpd   m1, rC1
        movapd  NB0so+144-120(pA0), m1
        mulpd   m0, m1
        addpd   m2, rC2
        movapd  NB1so+144-120(pA0), m2
        mulpd   m0, m2
        addpd   m3, rC3
        movapd  NB2so+144-120(pA0), m3
        mulpd   m0, m3
        mulpd   NB3so+144-120(pA0), m0
/*      byte offset 160 */
        addpd   m1, rC0
        movapd  160-120(pB0), m1
        addpd   m2, rC1
        movapd  NB0so+160-120(pA0), m2
        mulpd   m1, m2
        addpd   m3, rC2
        movapd  NB1so+160-120(pA0), m3
        mulpd   m1, m3
        addpd   m0, rC3
        movapd  NB2so+160-120(pA0), m0
        mulpd   m1, m0
        mulpd   NB3so+160-120(pA0), m1
/*      byte offset 176 */
        addpd   m2, rC0
        movapd  176-120(pB0), m2
        addpd   m3, rC1
        movapd  NB0so+176-120(pA0), m3
        mulpd   m2, m3
        addpd   m0, rC2
        movapd  NB1so+176-120(pA0), m0
        mulpd   m2, m0
        addpd   m1, rC3
        movapd  NB2so+176-120(pA0), m1
        mulpd   m2, m1
        mulpd   NB3so+176-120(pA0), m2
/*      byte offset 192 */
        addpd   m3, rC0
        movapd  192-120(pB0), m3
        addpd   m0, rC1
        movapd  NB0so+192-120(pA0), m0
        mulpd   m3, m0
        addpd   m1, rC2
        movapd  NB1so+192-120(pA0), m1
        mulpd   m3, m1
        addpd   m2, rC3
        movapd  NB2so+192-120(pA0), m2
        mulpd   m3, m2
        mulpd   NB3so+192-120(pA0), m3
/*      byte offset 208 */
        addpd   m0, rC0
        movapd  208-120(pB0), m0
        addpd   m1, rC1
        movapd  NB0so+208-120(pA0), m1
        mulpd   m0, m1
        addpd   m2, rC2
        movapd  NB1so+208-120(pA0), m2
        mulpd   m0, m2
        addpd   m3, rC3
        movapd  NB2so+208-120(pA0), m3
        mulpd   m0, m3
        mulpd   NB3so+208-120(pA0), m0
/*      byte offset 224 */
        addpd   m1, rC0
        movapd  224-120(pB0), m1
        addpd   m2, rC1
        movapd  NB0so+224-120(pA0), m2
        mulpd   m1, m2
        addpd   m3, rC2
        movapd  NB1so+224-120(pA0), m3
        mulpd   m1, m3
        addpd   m0, rC3
        movapd  NB2so+224-120(pA0), m0
        mulpd   m1, m0
        mulpd   NB3so+224-120(pA0), m1
/*      byte offset 240; both pointers receive a second +120 bias here,  */
/*      so the remaining displacements subtract 120 twice                */
        addpd   m2, rC0
        movapd  240-120(pB0), m2
        addl    $120, pB0
        addpd   m3, rC1
        movapd  NB0so+240-120(pA0), m3
        mulpd   m2, m3
        addpd   m0, rC2
        movapd  NB1so+240-120(pA0), m0
        mulpd   m2, m0
        addpd   m1, rC3
        movapd  NB2so+240-120(pA0), m1
        mulpd   m2, m1
        mulpd   NB3so+240-120(pA0), m2
        addl    $120, pA0
/*      byte offset 256 */
        addpd   m3, rC0
        movapd  256-120-120(pB0), m3
        addpd   m0, rC1
        movapd  NB0so+256-120-120(pA0), m0
        mulpd   m3, m0
        addpd   m1, rC2
        movapd  NB1so+256-120-120(pA0), m1
        mulpd   m3, m1
        addpd   m2, rC3
        movapd  NB2so+256-120-120(pA0), m2
        mulpd   m3, m2
        mulpd   NB3so+256-120-120(pA0), m3
/*      byte offset 272 */
        addpd   m0, rC0
        movapd  272-120-120(pB0), m0
        addpd   m1, rC1
        movapd  NB0so+272-120-120(pA0), m1
        mulpd   m0, m1
        addpd   m2, rC2
        movapd  NB1so+272-120-120(pA0), m2
        mulpd   m0, m2
        addpd   m3, rC3
        movapd  NB2so+272-120-120(pA0), m3
        mulpd   m0, m3
        mulpd   NB3so+272-120-120(pA0), m0
/*      byte offset 288 */
        addpd   m1, rC0
        movapd  288-120-120(pB0), m1
        addpd   m2, rC1
        movapd  NB0so+288-120-120(pA0), m2
        mulpd   m1, m2
        addpd   m3, rC2
        movapd  NB1so+288-120-120(pA0), m3
        mulpd   m1, m3
        addpd   m0, rC3
        movapd  NB2so+288-120-120(pA0), m0
        mulpd   m1, m0
        mulpd   NB3so+288-120-120(pA0), m1
/*      byte offset 304 */
        addpd   m2, rC0
        movapd  304-120-120(pB0), m2
        addpd   m3, rC1
        movapd  NB0so+304-120-120(pA0), m3
        mulpd   m2, m3
        addpd   m0, rC2
        movapd  NB1so+304-120-120(pA0), m0
        mulpd   m2, m0
        addpd   m1, rC3
        movapd  NB2so+304-120-120(pA0), m1
        mulpd   m2, m1
        mulpd   NB3so+304-120-120(pA0), m2
/*      byte offset 320 */
        addpd   m3, rC0
        movapd  320-120-120(pB0), m3
        addpd   m0, rC1
        movapd  NB0so+320-120-120(pA0), m0
        mulpd   m3, m0
        addpd   m1, rC2
        movapd  NB1so+320-120-120(pA0), m1
        mulpd   m3, m1
        addpd   m2, rC3
        movapd  NB2so+320-120-120(pA0), m2
        mulpd   m3, m2
        mulpd   NB3so+320-120-120(pA0), m3
/*      byte offset 336 (last pair: doubles 42 and 43) */
        addpd   m0, rC0
        movapd  336-120-120(pB0), m0
        addpd   m1, rC1
        movapd  NB0so+336-120-120(pA0), m1
        mulpd   m0, m1
        addpd   m2, rC2
        movapd  NB1so+336-120-120(pA0), m2
        mulpd   m0, m2
        addpd   m3, rC3
        movapd  NB2so+336-120-120(pA0), m3
        mulpd   m0, m3
        mulpd   NB3so+336-120-120(pA0), m0
/*
 *      Drain the pipeline: add the last four products.  For BETAX the
 *      loads of C and the multiply by beta are interleaved with the adds,
 *      leaving beta*C[0..1] in m2 and beta*C[2..3] in m3.
 */
#ifdef BETAX
   #ifdef DCPLX
        addpd   m1, rC0
        movlpd  32(pC), m1
        addpd   m2, rC1
        movhpd  48(pC), m1
        movlpd  (pC), m2
        addpd   m3, rC2
        movapd  BETAOFF(%esp), m3
        movhpd  16(pC), m2
        addpd   m0, rC3
        mulpd   m3, m2
        mulpd   m1, m3
   #else
        addpd   m1, rC0
        movapd  BETAOFF(%esp), m1
        addpd   m2, rC1
        movupd  (pC), m2
        addpd   m3, rC2
        movupd  16(pC), m3
        addpd   m0, rC3
        mulpd   m1, m2
        mulpd   m1, m3
   #endif
#else
        addpd   m1, rC0
        addpd   m2, rC1
        addpd   m3, rC2
        addpd   m0, rC3
#endif
/*
 *      Horizontal sums: fold the two halves of each accumulator so that
 *      rC0 holds results 0,1 and rC2 holds results 2,3.  Prefetches and
 *      pointer updates are interleaved with the shuffles.
 */
                                                /* rC0 = c0a  c0b */
                                                /* rC1 = c1a  c1b */
                                                /* rC2 = c2a  c2b */
                                                /* rC3 = c3a  c3b */
        movapd  rC0, m0
   #ifdef BETA1
        addsd   CMUL(24)(pC), rC3               /* seed for result 3        */
   #endif
        unpcklpd rC1, rC0                       /* rC0 = c0a  c1a           */
        prefB((pfB))
        unpckhpd rC1, m0                        /*  m0 = c0b  c1b           */
        addl    $32, pfB
        addpd   m0, rC0                         /* rC0 = c0ab c1ab          */
        subl    $120, pB0                       /* undo second bias on pB   */
        movapd  rC2, m0
        pref2((pfA))
        unpcklpd rC3, rC2                       /* rC2 = c2a  c3a           */
        unpckhpd rC3, m0                        /*  m0 = c2b  c3b           */
        addl    $32, pfA
        addl    $NB4so-120, pA0                 /* undo bias, pA += 4 runs  */
        addpd   m0, rC2                         /* rC2 = c2ab c3ab          */
/*
 *      Write results back to C
 */
   #ifdef BETAX
        addpd   m2, rC0
        addpd   m3, rC2
   #endif
   #ifdef DCPLX
        movlpd  rC0, (pC)
        movhpd  rC0, 16(pC)
        movlpd  rC2, 32(pC)
        movhpd  rC2, 48(pC)
   #else
        movupd  rC0, (pC)
        movupd  rC2, 16(pC)
   #endif
/*
 *      pC += 4 elements (pA was already advanced by NB4so above)
 */
        addl    $CMUL(32), pC
/*
 *      stM -= 4;  repeat while stM != 0
 */
        subb    $4, stM
        jnz     MLOOP
#endif
/*
 *      pC += incCn;  pA -= M*NBso (back to the start of A);  pB += NBso
 */
        addl    incCn, pC
/*      addl    COFF(%esp), pC */
   #if MB == 0
        subl    MKOFF(%esp), pA0
   #else
        subl    $MBKBso, pA0
   #endif
        addl    $NBso, pB
/*
 *      stN -= 1;  repeat while stN != 0
 */
        sub     $1, stN
        jnz     NLOOP
/*
 *      Restore callee-saved iregs; the BETAX frame reloads the entry esp
 *      that was stored at 16(%esp) instead of adding a constant.
 */
   #ifndef BETAX
        movl    12(%esp), %ebp
        movl     8(%esp), %ebx
        movl     4(%esp), %esi
        movl      (%esp), %edi
        addl    $FSIZE, %esp
   #else
        movl    32(%esp), %ebp
        movl    28(%esp), %ebx
        movl    24(%esp), %esi
        movl    20(%esp), %edi
        movl    16(%esp), %esp
   #endif
        ret

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?