atl_dmm6x1x60pabc.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 3,694 行 · 第 1/5 页

C
3,694
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2002 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#include "atlas_asm.h"#ifdef ATL_GAS_x8632   #define movq movl   #define addq addl   #define subq subl   #define rsp  esp#elif !defined(ATL_GAS_x8664)   #error "This kernel requires a gas x86 assembler!"#endif#if !defined(NB) || (NB == 0)   #error "NB must be a compile-time constant!"#endif#if (NB != 60)   #error "NB must be 60!"#endif#if (NB/6)*6 != NB   #error "NB must be multiple of 6!"#endif/* * Integer register usage shown be these defines */#ifdef ATL_GAS_x8632   #define pC      %esi   #define pA      %ecx   #define pB      %edi   #define incCn   %eax   #define stM     %edx   #define stN     %ebx   #define pfA     %ebp#else   #define pC      %rsi   #define pA      %rcx   #define pB      %rdi   #define incCn   %rax   #define stM     %rdx   #define stN     %rbx   #define pfA	   %rbp   /*       rax     used in 32/64 conversion */#endif#define NBso	(NB*8)#define NBNBso  (NB*NB*8)#define NB2so   (NBso+NBso)#define NB3so   (NBso+NBso+NBso)#define NB4so   (NBso+NBso+NBso+NBso)#define NB5so   (NBso+NBso+NBso+NBso+NBso)#define NB6so   (NBso+NBso+NBso+NBso+NBso+NBso)#define NB7so   (NB6so+NBso)#define NB8so   (NB6so+NB2so)#define NB9so   (NB6so+NB3so)#define NB10so   (NB6so+NB4so)#define NB11so   (NB6so+NB5so)/* * Prefetch defines */#define pref2(mem) prefetcht1	mem#define prefB(mem) prefetcht0	mem#define prefC(mem) prefetchw	mem/* *void ATL_AUSERMM(const int M, const int N, const int K, const TYPE alpha, *                const TYPE *A, const int lda, const TYPE *B, const int ldb, *                const TYPE beta, TYPE *C, const int ldc) */	.text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):#ifdef ATL_GAS_x8632/* *      Save callee-saved iregs */	subl	$24, %esp	movl	%ebp, 20(%esp)	movl	%ebx, 16(%esp)	movl	%esi, 12(%esp)	movl	%edi,  8(%esp)   #ifdef BETAX   	fldl	64(%esp)	fstpl	(%esp)      #define BETAOFF 0   #endif/* *      Initialize pA = A;  pB = B; pC = C; */	movl	48(%esp), pA	movl	56(%esp), pB	movl	72(%esp), pC	prefC((pC))	prefC(64(pC))/* *      stM 
= pA + NBNB-6*NB;  pfA = pA+NBNB;  stN = pB + NBNB; */	movl	$NBNBso-NB6so, stM	addl	pA, stM	movl	stM, pfA	addl	$NB6so, pfA	movl	$NBNBso, stN	addl	pB, stN/* *      Set incCn = (ldc - NB)*sizeof */	movl	76(%esp), incCn	prefB((pB))	prefB(64(pB))	subl	$(MB-6), incCn	shl	$3, incCn#else/* *      Save callee-saved iregs */	movq	%rbp, -8(%rsp)	movq	%rbx, -16(%rsp)   #ifdef BETAX	movsd	%xmm1, -24(%rsp)      #define BETAOFF -24   #endif/* *      pA already comes in right reg *      Initialize pB = B; pC = C; NBso = NB * sizeof; */	movq	%r9, pB	movq	16(%rsp), pC	prefC((pC))	prefC(64(pC))/* *      stM = pA + NBNBso;  stN = pB + NBNBso; */	movq	$NBNBso-NB6so, stM	addq	pA, stM	movq	stM, pfA	addq	$NB6so, pfA	movq	$NBNBso, stN	addq	pB, stN/* *      convert ldc to 64 bits, and then set incCn = (ldc - NB)*sizeof */	movl	24(%rsp), %eax	cltq	prefB((pB))	prefB(64(pB))/*	movq	%rax, incCn */	subq	$NB, incCn	shl	$3, incCn	addq	$48, incCn#endifNLOOP:/* *      stK = pB + NBso *//*	movq	pB, stK *//*	addq	$NBso, stK */MLOOP:/* *Load C, apply beta.  
Stack will be: * st(0)  temp * st(1)  temp * st(2)  pC[0] * st(3)  pC[1] * st(4)  pC[2] * st(5)  pC[3] * st(6)  pC[4] * st(7)  pC[5] */#ifdef BETA0	fldz	fldz	fldz	fldz	fldz	fldz#else	fldl	40(pC)	fldl	32(pC)	fldl	24(pC)	fldl	16(pC)	fldl	8(pC)	fldl	(pC)   #ifdef BETAX	fldl	BETAOFF(%rsp)	fmul	%st, %st(1)	fmul	%st, %st(2)	fmul	%st, %st(3)	fmul	%st, %st(4)	fmul	%st, %st(5)	fmulp	%st, %st(6)   #endif#endif	ALIGN16/*KLOOP: */	pref2((pfA))	fldl	(pB)	fldl	(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	8(pB)	fldl	8(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	8+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	8+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	8+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	8+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	8+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	pref2(64(pfA))	fldl	16(pB)	fldl	16(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	16+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	16+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	16+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	16+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	16+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	24(pB)	fldl	24(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	24+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	24+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	24+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	24+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	24+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	pref2(128(pfA))	addq	$160, pfA	fldl	32(pB)	fldl	32(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	32+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	32+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	32+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	32+NB4so(pA)	fmul	%st(1), 
%st	faddp	%st, %st(6)	fldl	32+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	40(pB)	fldl	40(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	40+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	40+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	40+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	40+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	40+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	48(pB)	fldl	48(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	48+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	48+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	48+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	48+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	48+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	56(pB)	fldl	56(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	56+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	56+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	56+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	56+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	56+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	64(pB)	fldl	64(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	64+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	64+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	64+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	64+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	64+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	72(pB)	fldl	72(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	72+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	72+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	72+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	72+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	72+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	80(pB)	fldl	80(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	80+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	80+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	80+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	80+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	80+NB5so(pA)	fmulp	%st, 
%st(1)	faddp	%st, %st(6)	fldl	88(pB)	fldl	88(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	88+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	88+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	88+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	88+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	88+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	96(pB)	fldl	96(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	96+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	96+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	96+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	96+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	96+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	104(pB)	fldl	104(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	104+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	104+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	104+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	104+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	104+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	112(pB)	fldl	112(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	112+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	112+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	112+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	112+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	112+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	120(pB)	fldl	120(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	120+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	120+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	120+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	120+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	120+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	128(pB)	fldl	128(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	128+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	128+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	128+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	128+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	128+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	
fldl	136(pB)	fldl	136(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	136+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	136+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	136+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	136+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	136+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	144(pB)	fldl	144(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	144+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	144+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	144+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	144+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	144+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	152(pB)	fldl	152(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	152+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	152+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	152+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	152+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	152+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	160(pB)	fldl	160(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	160+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	160+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	160+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	160+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	160+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	168(pB)	fldl	168(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	168+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	168+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	168+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	168+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	168+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	176(pB)	fldl	176(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	176+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	176+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	176+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	176+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	176+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	184(pB)	
fldl	184(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	184+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	184+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	184+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	184+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	184+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	192(pB)	fldl	192(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	192+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	192+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	192+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	192+NB4so(pA)	fmul	%st(1), %st	faddp	%st, %st(6)	fldl	192+NB5so(pA)	fmulp	%st, %st(1)	faddp	%st, %st(6)	fldl	200(pB)	fldl	200(pA)	fmul	%st(1), %st	faddp	%st, %st(2)	fldl	200+NBso(pA)	fmul	%st(1), %st	faddp	%st, %st(3)	fldl	200+NB2so(pA)	fmul	%st(1), %st	faddp	%st, %st(4)	fldl	200+NB3so(pA)	fmul	%st(1), %st	faddp	%st, %st(5)	fldl	200+NB4so(pA)	fmul	%st(1), %st

⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?