atl_dmm14x1x56_sse2pabc_k.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 3,047 行 · 第 1/5 页

C
3,047
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2002 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#ifndef ATL_GAS_x8664   #error "This kernel requires x86-64 assembly!"#endif#ifdef ATL_OS_SunOS   #define ATL_DIV_NUM MB   #define ATL_DIV_DEN 14#endif#include "atlas_asm.h"#if !defined(KB) || (KB == 0)   #error "KB must be a compile-time constant!"#endif#if ((KB/2)*2 == KB)#if NB != MB   #error "For this kernel, MB = NB required!"#endif#if (MB/14)*14 != MB   #error "MB must be multiple of 14!"#endif#ifdef DREAL   #define CMUL(arg_) arg_#else   #define CMUL(arg_) 2*arg_#endif/* * Integer register usage shown be these defines */#define pA      %rcx#define pA10	%rbx#define ldab	%rbp#define mldab	%rdx#define mldab5  %rax#define pB      %rdi#define pC      %rsi#define incCn   %r10#define stM     %r9#define stN     %r11#define pfA	%r8#define pA5 	pA#define pB0	pB/*       rax     used in 32/64 conversion */#define NBso	(KB*8)#define MBKBso  (MB*KB*8)#define NB2so   (NBso+NBso)#define NB3so   (NBso+NBso+NBso)#define NB4so   (NBso+NBso+NBso+NBso)#define NB5so   (NBso+NBso+NBso+NBso+NBso)#define NB6so   (NB3so+NB3so)#define NB7so   (NB3so+NB4so)#define NB8so   (NB4so+NB4so)#define NB9so   (NB4so+NB5so)#define NB10so   (NB5so+NB5so)#define NB11so   (NB6so+NB5so)#define NB12so   (NB7so+NB5so)#define NB13so   (NB8so+NB5so)#define NB14so   (NB9so+NB5so)/* * SSE2 register usage shown be these defines */#define rA0	%xmm0#define rB0	%xmm1#define rC0	%xmm2#define rC1	%xmm3#define rC2	%xmm4#define rC3	%xmm5#define rC4	%xmm6#define rC5	%xmm7#define rC6	%xmm8#define rC7	%xmm9#define rC8	%xmm10#define rC9	%xmm11#define rC10	%xmm12#define rC11	%xmm13#define rC12	%xmm14#define rC13	%xmm15/* *  Prefetch defines */#if 1#define pref2(mem) prefetcht1   mem#define prefB(mem) prefetcht0   mem#define prefC(mem) prefetchw    mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif	.text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* *      Save callee-saved iregs */	movq	%rbp, -8(%rsp)	movq	%rbx, -16(%rsp)/*	movq	%r12, -32(%rsp) *//*	movq	%r13, -40(%rsp) */#ifdef BETAX   #define BOF -24	movlpd	%xmm1, BOF(%rsp)#endif/* *      pA already comes in right reg *      Initialize pB = B; pC = C; */	movq	16(%rsp), pC			prefC((pC))			prefC(64(pC))	movq	%r9, pB			prefB((pB))			prefB(64(pB))/* *      setup prefetch ptr for next blk of A */	movq	$MBKBso, pfA	addq	pA5, pfA			prefB(128(pB))/* *      convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof */	movl	24(%rsp), %eax	cltq	movq	%rax, incCn#ifdef DREAL	subq	$MB-14, incCn	shl	$3, incCn#else	subq	$(MB-14), incCn	shl	$4, incCn			prefC(128(pC))			prefC(192(pC))#endif	addq	$120, pA5	addq	$120, pB0	movq	$KB*8, ldab	movq	$-KB*5*8, mldab5	movq	$-KB*8, mldab	subq	mldab5, pA5	lea	KB*8(pA5, ldab,4), pA10	movq	$NB, stNUNLOOP:   #ifdef ATL_DivAns	movq	$ATL_DivAns-1, stM   #else	movq	$MB/14-1, stM   #endifUMLOOP:/* *	rC[0-13] = pC[0-13] * beta */	ALIGN16/*UKLOOP: */#ifdef BETA1	movapd	0-120(pA10,mldab5,2), rC0	movapd	0-120(pB0), rB0	mulpd	rB0, rC0	addsd	(pC), rC0	movapd	0-120(pA5, mldab,4), rC1	mulpd	rB0, rC1	addsd	CMUL(8)(pC), rC1	movapd	0-120(pA10, mldab,8), rC2	mulpd	rB0, rC2	addsd	CMUL(16)(pC), rC2	movapd	0-120(pA5, mldab,2), rC3	mulpd	rB0, rC3	addsd	CMUL(24)(pC), rC3	movapd	0-120(pA5, mldab), rC4	mulpd	rB0, rC4	addsd	CMUL(32)(pC), rC4	movapd	0-120(pA5), rC5	mulpd	rB0, rC5	addsd	CMUL(40)(pC), rC5	movapd	0-120(pA5, ldab), rC6	mulpd	rB0, rC6	addsd	CMUL(48)(pC), rC6	movapd	0-120(pA5, ldab,2), rC7	mulpd	rB0, rC7	addsd	CMUL(56)(pC), rC7	movapd	0-120(pA10, mldab,2), rC8	mulpd	rB0, rC8	addsd	CMUL(64)(pC), rC8	movapd	0-120(pA5,ldab,4), rC9	mulpd	rB0, rC9	addsd	CMUL(72)(pC), rC9	movapd	0-120(pA10), rC10	mulpd	rB0, rC10	addsd	CMUL(80)(pC), rC10	movapd	0-120(pA10,ldab), rC11	mulpd	rB0, rC11	addsd	CMUL(88)(pC), rC11	movapd	0-120(pA10,ldab,2), rC12	mulpd	rB0, rC12	addsd	CMUL(96)(pC), rC12	movapd	0-120(pA5,ldab,8), rC13	mulpd	rB0, rC13	addsd	CMUL(104)(pC), rC13#elif defined(BETA0)	movapd	0-120(pA10,mldab5,2), rC0	movapd	0-120(pB0), rC13	mulpd	rC13, rC0	movapd	0-120(pA5, mldab,4), rC1	mulpd	rC13, rC1	movapd	0-120(pA10, mldab,8), rC2	mulpd	rC13, rC2	movapd	0-120(pA5, mldab,2), rC3	mulpd	rC13, rC3	movapd	0-120(pA5, mldab), rC4	mulpd	rC13, rC4	movapd	0-120(pA5), rC5	mulpd	rC13, rC5	movapd	0-120(pA5, ldab), rC6	mulpd	rC13, rC6	movapd	0-120(pA5, ldab,2), rC7	mulpd	rC13, rC7	movapd	0-120(pA10, mldab,2), rC8	mulpd	rC13, rC8	movapd	0-120(pA5,ldab,4), rC9	mulpd	rC13, rC9	movapd	0-120(pA10), rC10	mulpd	rC13, rC10	movapd	0-120(pA10,ldab), rC11	mulpd	rC13, rC11	movapd	0-120(pA10,ldab,2), rC12	mulpd	rC13, rC12	mulpd 	0-120(pA5,ldab,8), rC13#else	movsd	BOF(%rsp), rC0	movapd	rC0, rC1	movapd	rC0, rC2	movapd	rC0, rC3	movapd	rC0, rC4	movapd	rC0, rC5	movapd	rC0, rC6	movapd	rC0, rC7	movapd	rC0, rC8	movapd	rC0, rC9	movapd	rC0, rC10	movapd	rC0, rC11	movapd	rC0, rC12	movapd	rC0, rC13	mulsd	(pC), rC0	mulsd	CMUL(8)(pC), rC1	mulsd	CMUL(16)(pC), rC2	mulsd	CMUL(24)(pC), rC3	mulsd	CMUL(32)(pC), rC4	mulsd	CMUL(40)(pC), rC5	mulsd	CMUL(48)(pC), rC6	mulsd	CMUL(56)(pC), rC7	mulsd	CMUL(64)(pC), rC8	mulsd	CMUL(72)(pC), rC9	mulsd	CMUL(80)(pC), rC10	mulsd	CMUL(88)(pC), rC11	mulsd	CMUL(96)(pC), rC12	mulsd	CMUL(104)(pC), rC13	movapd	0-120(pA10,mldab5,2), rA0	movapd	0-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	0-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	0-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	0-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	0-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	0-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	0-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	0-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	0-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	0-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	0-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	0-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	0-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	0-120(pA5,ldab,8), rB0	addpd	rB0, rC13#endif#if KB > 2	movapd	16-120(pA10,mldab5,2), rA0	movapd	16-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	16-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	16-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	16-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	16-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	16-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	16-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	16-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	16-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	16-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	16-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	16-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	16-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	16-120(pA5,ldab,8), rB0	addpd	rB0, rC13#endif#if KB > 4	movapd	32-120(pA10,mldab5,2), rA0	movapd	32-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	32-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	32-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	32-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	32-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	32-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	32-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	32-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	32-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	32-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	32-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	32-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	32-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	32-120(pA5,ldab,8), rB0	addpd	rB0, rC13#endif#if KB > 6	movapd	48-120(pA10,mldab5,2), rA0	movapd	48-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	48-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	48-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	48-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	48-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	48-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	48-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	48-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	48-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	48-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	48-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	48-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	48-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	48-120(pA5,ldab,8), rB0	addpd	rB0, rC13#endif#ifndef DREAL						pref2((pfA))						pref2(64(pfA))#endif#if KB > 8	movapd	64-120(pA10,mldab5,2), rA0	movapd	64-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	64-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	64-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	64-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	64-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	64-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	64-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	64-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	64-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	64-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	64-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	64-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	64-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	64-120(pA5,ldab,8), rB0	addpd	rB0, rC13#endif#if KB > 10	movapd	80-120(pA10,mldab5,2), rA0	movapd	80-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	80-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	80-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	80-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	80-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	80-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	80-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	80-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	80-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	80-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	80-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	80-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	80-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	80-120(pA5,ldab,8), rB0	addpd	rB0, rC13#endif#if KB > 12	movapd	96-120(pA10,mldab5,2), rA0	movapd	96-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	96-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	96-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	96-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	96-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	96-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	96-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	96-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	96-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	96-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	96-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	96-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	96-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	96-120(pA5,ldab,8), rB0	addpd	rB0, rC13#endif

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?