atl_smm4x1x60_4_sse2.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 481 行

C
481
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2004 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_asm.h"/* *Efficeon-optimized 4x1x60 SGEMM.  Pipelined to 4 (4 accumulators). *Prefetches the next col of B, and a col from the next block of A in the M-loop *Purposely kept small so it is retained in cache, and easy to translate when *not */#ifndef ATL_GAS_x8632   #error "This kernel requires gas x86-32 assembler!"#endif/*#if KB != 60   #error "KB must be 60!"#endif*/#if !defined(KB) || (KB == 0)   #error "KB must be a compile-time constant!"#endif#if !defined(NB)   #define NB 0#endif#if !defined(MB)   #define MB 0#endif#if (MB/4)*4 != MB   #error "MB must be multiple of 4!"#endif#ifdef SCPLX   #define OFF 16   #define CMUL(i_) (2*(i_))#else   #define OFF 8   #define CMUL(i_) i_#endif/* * Integer register usage shown be these defines */#define pC      %esi#define pA      %ecx#define pB      %edi#define incCn   %eax#define stM	%bl#define stN	%bh#define pfB  	%edx#define pfA     %ebp#define pA0	pA#define pB0	pB#define m0      %xmm0#define m1      %xmm1#define m2      %xmm2#define m3      %xmm3#define rC0     %xmm4#define rC1     %xmm5#define rC2     %xmm6#define rC3     %xmm7#define NB0so   0#define NBso	(KB*4)#define NB1so	(KB*4)#define NB2so   (NBso+NBso)#define NB3so   (NBso+NBso+NBso)#define NB4so   (NBso+NBso+NBso+NBso)#define NB5so   (NBso+NBso+NBso+NBso+NBso)#define NB6so   (NBso+NBso+NBso+NBso+NBso+NBso)#define NB7so   (NB6so+NBso)#define NB8so   (NB6so+NB2so)#define NB9so   (NB6so+NB3so)#define NB10so   (NB6so+NB4so)#define NB11so   (NB6so+NB5so)#if MB != 0   #define MBKBso  (MB*KB*4)#endif/* * Prefetch defines */#if 1   #define pref2(mem) prefetcht0	mem   #define prefB(mem) prefetcht0	mem   #define prefC(mem) prefetcht0	mem#else   #define pref2(mem)   #define prefB(mem)   #define prefC(mem)#endif/*offset                    4            8           12                16 *void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, *offset                     20             24             28            32 *                const TYPE *A, const int lda, const TYPE *B, const int ldb, *offset                       36       40             44 *                const TYPE beta, TYPE *C, const int ldc) */	.text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* *      Save callee-saved iregs; Save old stack pointer in eax, *      so we can adjust for BETA alignment */#define FSIZE 28#define BETAOFF FSIZE+36#define COFF 16	subl	$FSIZE, %esp	movl	%ebp, 12(%esp)	movl	%ebx,  8(%esp)	movl	%esi,  4(%esp)	movl	%edi,   (%esp)/* *      Initialize pA = A;  pB = B; pC = C; */#if MB == 0        movl    FSIZE+4(%esp), %ebx        movl    %ebx, COFF+4(%esp)        imul    $NBso, %ebx        movl    %ebx, COFF+8(%esp)#endif	movl	FSIZE+20(%esp), pA	movl	FSIZE+28(%esp), pB	movl	FSIZE+40(%esp), pC#if NB == 0        movb    FSIZE+8(%esp), stN#else        movb    $NB, stN#endif/* *      Set incCn = (ldc - MB)*sizeof */	movl	FSIZE+44(%esp), incCn   #if MB == 0        subl    COFF+4(%esp), incCn   #else	subl	$MB, incCn   #endif   #ifdef SCPLX	shl	$3, incCn   #else	shl	$2, incCn   #endif/*   	movl	incCn, COFF(%esp) */        movl    pA0, pfA#if MB == 0        addl    COFF+8(%esp), pfA#else        addl    $MBKBso, pfA#endif        addl    $120, pA0        addl    $120, pB0NLOOP:#if MB == 0        movb    COFF+4(%esp), stM#else        movb     $MB, stM#endif        lea     120+NBso(pB0), pfBMLOOP:#ifdef BETA0	xorps	rC0, rC0	xorps	rC1, rC1	xorps	rC2, rC2	xorps	rC3, rC3#else	movss	(pC), rC0	movss	CMUL(4)(pC), rC1	movss	CMUL(8)(pC), rC2	movss	CMUL(12)(pC), rC3   #ifdef BETAX	movss	BETAOFF(%esp), m0	mulss	m0, rC0	mulss	m0, rC1	mulss	m0, rC2	mulss	m0, rC3   #endif#endif	movaps	0-120(pB0), m3	movaps	0-120(pA0), m0	movaps	NBso+0-120(pA0), m1	movaps	NB2so+0-120(pA0), m2	mulps	m3, m0	mulps	m3, m1	mulps	m3, m2/* *      Unrolled & pipelined K-loop */	mulps	NB3so+0-120(pA0), m3	addps	m0, rC0	movaps	16-120(pB0), m0	addps	m1, rC1	movaps	NB0so+16-120(pA0), m1	mulps	m0, m1	addps	m2, rC2	movaps	NB1so+16-120(pA0), m2	mulps	m0, m2	addps	m3, rC3	movaps	NB2so+16-120(pA0), m3	mulps	m0, m3	mulps	NB3so+16-120(pA0), m0	addps	m1, rC0	movaps	32-120(pB0), m1	addps	m2, rC1	movaps	NB0so+32-120(pA0), m2	mulps	m1, m2	addps	m3, rC2	movaps	NB1so+32-120(pA0), m3	mulps	m1, m3	addps	m0, rC3	movaps	NB2so+32-120(pA0), m0	mulps	m1, m0	mulps	NB3so+32-120(pA0), m1	addps	m2, rC0	movaps	48-120(pB0), m2	addps	m3, rC1	movaps	NB0so+48-120(pA0), m3	mulps	m2, m3	addps	m0, rC2	movaps	NB1so+48-120(pA0), m0	mulps	m2, m0	addps	m1, rC3	movaps	NB2so+48-120(pA0), m1	mulps	m2, m1	mulps	NB3so+48-120(pA0), m2	addps	m3, rC0	movaps	64-120(pB0), m3	addps	m0, rC1	movaps	NB0so+64-120(pA0), m0	mulps	m3, m0	addps	m1, rC2	movaps	NB1so+64-120(pA0), m1	mulps	m3, m1	addps	m2, rC3	movaps	NB2so+64-120(pA0), m2	mulps	m3, m2	mulps	NB3so+64-120(pA0), m3	addps	m0, rC0	movaps	80-120(pB0), m0	addps	m1, rC1	movaps	NB0so+80-120(pA0), m1	mulps	m0, m1	addps	m2, rC2	movaps	NB1so+80-120(pA0), m2	mulps	m0, m2	addps	m3, rC3	movaps	NB2so+80-120(pA0), m3	mulps	m0, m3	mulps	NB3so+80-120(pA0), m0	addps	m1, rC0	movaps	96-120(pB0), m1	addps	m2, rC1	movaps	NB0so+96-120(pA0), m2	mulps	m1, m2	addps	m3, rC2	movaps	NB1so+96-120(pA0), m3	mulps	m1, m3	addps	m0, rC3	movaps	NB2so+96-120(pA0), m0	mulps	m1, m0	mulps	NB3so+96-120(pA0), m1	addps	m2, rC0	movaps	112-120(pB0), m2	addps	m3, rC1	movaps	NB0so+112-120(pA0), m3	mulps	m2, m3	addps	m0, rC2	movaps	NB1so+112-120(pA0), m0	mulps	m2, m0	addps	m1, rC3	movaps	NB2so+112-120(pA0), m1	mulps	m2, m1	mulps	NB3so+112-120(pA0), m2	addps	m3, rC0	movaps	128-120(pB0), m3	addps	m0, rC1	movaps	NB0so+128-120(pA0), m0	mulps	m3, m0	addps	m1, rC2	movaps	NB1so+128-120(pA0), m1	mulps	m3, m1	addps	m2, rC3	movaps	NB2so+128-120(pA0), m2	mulps	m3, m2	mulps	NB3so+128-120(pA0), m3	addps	m0, rC0	movaps	144-120(pB0), m0	addps	m1, rC1	movaps	NB0so+144-120(pA0), m1	mulps	m0, m1	addps	m2, rC2	movaps	NB1so+144-120(pA0), m2	mulps	m0, m2	addps	m3, rC3	movaps	NB2so+144-120(pA0), m3	mulps	m0, m3	mulps	NB3so+144-120(pA0), m0	addps	m1, rC0	movaps	160-120(pB0), m1	addps	m2, rC1	movaps	NB0so+160-120(pA0), m2	mulps	m1, m2	addps	m3, rC2	movaps	NB1so+160-120(pA0), m3	mulps	m1, m3	addps	m0, rC3	movaps	NB2so+160-120(pA0), m0	mulps	m1, m0	mulps	NB3so+160-120(pA0), m1	addps	m2, rC0	movaps	176-120(pB0), m2	addps	m3, rC1	movaps	NB0so+176-120(pA0), m3	mulps	m2, m3	addps	m0, rC2	movaps	NB1so+176-120(pA0), m0	mulps	m2, m0	addps	m1, rC3	movaps	NB2so+176-120(pA0), m1	mulps	m2, m1	mulps	NB3so+176-120(pA0), m2	addps	m3, rC0	movaps	192-120(pB0), m3	addps	m0, rC1	movaps	NB0so+192-120(pA0), m0	mulps	m3, m0	addps	m1, rC2	movaps	NB1so+192-120(pA0), m1	mulps	m3, m1	addps	m2, rC3	movaps	NB2so+192-120(pA0), m2	mulps	m3, m2	mulps	NB3so+192-120(pA0), m3	addps	m0, rC0	movaps	208-120(pB0), m0	addps	m1, rC1	movaps	NB0so+208-120(pA0), m1	mulps	m0, m1	addps	m2, rC2	movaps	NB1so+208-120(pA0), m2	mulps	m0, m2	addps	m3, rC3	movaps	NB2so+208-120(pA0), m3	mulps	m0, m3	mulps	NB3so+208-120(pA0), m0	addps	m1, rC0	movaps	224-120(pB0), m1	addps	m2, rC1	movaps	NB0so+224-120(pA0), m2	mulps	m1, m2	addps	m3, rC2	movaps	NB1so+224-120(pA0), m3	mulps	m1, m3	addps	m0, rC3	movaps	NB2so+224-120(pA0), m0	mulps	m1, m0	mulps	NB3so+224-120(pA0), m1	addps	m2, rC0	addps	m3, rC1	addps	m0, rC2	addps	m1, rC3/* *      Get these bastard things summed up correctly *      Note this summation is Camm's, as his sequence was faster *      than the piece of crap I came up with */        movaps          rC0, m0        /* m0 = c0d    c0c    c0b    c0a */        unpcklps        rC1, rC0        /* rC0 = c1b    c0b    c1a    c0d */        movaps          rC2, m1        /* m1 = c2d    c2c    c2b    c2a */        unpckhps        rC1, m0        /* m0 = c1d    c0d    c1c    c0c */                                        prefB((pfB))        unpcklps        rC3, rC2        /* rC2 = c3b    c2b    c3a    c2a */                                        addl    $16, pfB        addps           m0, rC0        /* rC0 = c1bd   c0bd   c1ac   c0ac */        unpckhps        rC3, m1        /* m1 = c3d    c2d    c3c    c2c */	                                addl	$NB4so, pA0        movaps          rC0, m0        /* m0 = c1bd   c0bd   c1ac   c0ac */        addps           m1, rC2        /* rC2 = c3bd   c2bd   c3ac   c2ac */        shufps          $0x44,rC2,rC0   /* rC0 = c3ac   c2ac   c1ac   c0ac */                                        pref2((pfA))        shufps          $0xEE,rC2,m0   /* m0 = c3bd   c2bd   c1bd   c0bd */                                        addl    $16, pfA        addps           m0, rC0        /* rC0 = c3abcd c2abcd c1abcd c0abcd */                                        /* rC1 = c1a  c1b */                                        /* rC2 = c2a  c2b *//* *      Write results back to C */   #ifdef SCPLX                                        /* rC0 = c3 c2 c1 c0 */        pshufd  $0xB1, rC0, rC1         /* rC1 = c2 c3 c0 c1 */        movhlps rC0, rC2                /* rC2 =  X  X c3 c2 */        movhlps rC1, rC3                /* rC3 =  X  X c2 c3 */        movss   rC0, (pC)        movss   rC1, 8(pC)        movss   rC2, 16(pC)        movss   rC3, 24(pC)   #else        movups  rC0, (pC)   #endif/* *      pC += 6;  pA += 2*NB */	addl	$CMUL(16), pC/* *      while (pA != stM); */	subb	$4, stM	jnz	MLOOP/* *      pC += incCn;  pA -= NBNB;  pB += NB; */	addl	incCn, pC/*   	addl	COFF(%esp), pC */   #if MB == 0        subl    COFF+8(%esp), pA0   #else        subl    $MBKBso, pA0   #endif        addl    $NBso, pB/* *      while (pB != stN); */	sub	$1, stN	jnz	NLOOP/* *      Restore callee-saved iregs */	movl	12(%esp), %ebp	movl	 8(%esp), %ebx	movl	 4(%esp), %esi	movl	  (%esp), %edi	addl	$FSIZE, %esp	ret

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?