atl_smm14x1x84_ssecu.c

来自「基于 BLAS、CLAPACK 的库，用过的人知道是干啥的」· C语言 代码 · 共 2,730 行 · 第 1/5 页

C
2,730
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2003 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#ifndef ATL_GAS_x8664   #error "This kernel requires x86-64 assembly!"#endif#ifdef ATL_OS_SunOS   #define ATL_DIV_NUM MB   #define ATL_DIV_DEN 14#endif#include "atlas_asm.h"#ifndef NB   #define NB 0#endif#ifndef MB   #define MB 0#endif#if !defined(KB) || (KB == 0)   #error "KB must be a compile-time constant!"#endif#if (MB/14)*14 != MB   #error "MB must be multiple of 14!"#endif#ifdef SREAL   #define CMUL(arg_) arg_#else   #define CMUL(arg_) 2*arg_#endif/* * Integer register usage shown be these defines */#define pA      %rcx#define pA10	%rbx#define ldab	%rbp#define mldab	%rdx#define mldab5  %rax#define pB      %rdi#define pC      %rsi#define incCn   %r10#define stM     %r9#define stN     %r11#define pfA	%r8#define pA5 	pA#define pB0	pB#if MB == 0   #define	stM0	%r12   #define	incAm	%r13#endif/*       rax     used in 32/64 conversion */#define NBso	(KB*4)#define MBKBso  (MB*KB*4)#define NB2so   (NBso+NBso)#define NB3so   (NBso+NBso+NBso)#define NB4so   (NBso+NBso+NBso+NBso)#define NB5so   (NBso+NBso+NBso+NBso+NBso)#define NB6so   (NB3so+NB3so)#define NB7so   (NB3so+NB4so)#define NB8so   (NB4so+NB4so)#define NB9so   (NB4so+NB5so)#define NB10so   (NB5so+NB5so)#define NB11so   (NB6so+NB5so)#define NB12so   (NB7so+NB5so)#define NB13so   (NB8so+NB5so)#define NB14so   (NB9so+NB5so)/* * SSE2 register usage shown be these defines */#define rA0	%xmm0#define rB0	%xmm1#define rC0	%xmm2#define rC1	%xmm3#define rC2	%xmm4#define rC3	%xmm5#define rC4	%xmm6#define rC5	%xmm7#define rC6	%xmm8#define rC7	%xmm9#define rC8	%xmm10#define rC9	%xmm11#define rC10	%xmm12#define rC11	%xmm13#define rC12	%xmm14#define rC13	%xmm15/* * Prefetch defines */#if 1#define pref2(mem) prefetcht1   mem#define prefB(mem) prefetcht0   mem#define prefC(mem) prefetchw    mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif#if MB != 0   #define incAm $MBKBso-NB14so+176#endif	.text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* *      Save callee-saved iregs */	movq	%rbp, 
-8(%rsp)	movq	%rbx, -16(%rsp)#if MB == 0	movq	%r12, -32(%rsp)	movq	%r13, -40(%rsp)#endif#ifdef BETAX   #define BOF -56	movss	%xmm1, BOF(%rsp)	movss	%xmm1, BOF+4(%rsp)	movss	%xmm1, BOF+8(%rsp)	movss	%xmm1, BOF+12(%rsp)#endif/* *      pA already comes in right reg *      Initialize pB = B; pC = C; NBso = NB * sizeof; */	movq	%rsi, stN	movq	%rdi, %rax	movq	16(%rsp), pC			prefC((pC))			prefC(64(pC))	movq	%r9, pB			prefB((pB))			prefB(64(pB))	movq	%rax, stM/* *      stM = pA + NBNBso;  stN = pB + NBNBso; */#if MB == 0	movq	stM, pfA	imulq	$NBso, pfA			prefB(128(pB))	movq	pfA, incAm	addq	pA5, pfA	addq	$176-NB14so, incAm#else	movq	$MBKBso, pfA	addq	pA5, pfA			prefB(128(pB))#endif/* *      convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof */	movl	24(%rsp), %eax	cltq	movq	%rax, incCn	subq	stM, incCn	addq	$14, incCn#ifdef SREAL	shl	$2, incCn#else	shl	$3, incCn			prefC(128(pC))			prefC(192(pC))#endif/* *      Find M/14 if MB is not set */#if MB == 0	cmp	$84, stM	jne	MB_LT84/*      movq	$84/14, stM */	movq	$6, stMMBFOUND:	subq	$1, stM	movq	stM, stM0#endif	addq	$120, pA5	addq	$120, pB0	movq	$KB*4, ldab	movq	$-KB*5*4, mldab5	movq	$-KB*4, mldab	subq	mldab5, pA5	lea	KB*4(pA5, ldab,4), pA10/*	movq	$NB, stN */UNLOOP:#if MB == 0	movq	stM0, stM	cmp	$0, stM	je	MLAST#else   #ifdef ATL_DivAns	movq	$ATL_DivAns-1, stM   #else	movq	$MB/14-1, stM   #endif#endif#if MB == 0 || MB > 14UMLOOP:/* *      rC[0-13] = pC[0-13] * beta */	ALIGN16/*UKLOOP: */#ifdef BETA1	movaps	0-120(pA10,mldab5,2), rC0	movaps	0-120(pB0), rB0	mulps	rB0, rC0	addss	(pC), rC0	movaps	0-120(pA5, mldab,4), rC1	mulps	rB0, rC1	addss	CMUL(4)(pC), rC1	movaps	0-120(pA10, mldab,8), rC2	mulps	rB0, rC2	addss	CMUL(8)(pC), rC2	movaps	0-120(pA5, mldab,2), rC3	mulps	rB0, rC3	addss	CMUL(12)(pC), rC3	movaps	0-120(pA5, mldab), rC4	mulps	rB0, rC4	addss	CMUL(16)(pC), rC4	movaps	0-120(pA5), rC5	mulps	rB0, rC5	addss	CMUL(20)(pC), rC5	movaps	0-120(pA5, ldab), rC6	mulps	rB0, rC6	addss	CMUL(24)(pC), rC6	movaps	0-120(pA5, ldab,2), rC7	
mulps	rB0, rC7	addss	CMUL(28)(pC), rC7	movaps	0-120(pA10, mldab,2), rC8	mulps	rB0, rC8	addss	CMUL(32)(pC), rC8	movaps	0-120(pA5,ldab,4), rC9	mulps	rB0, rC9	addss	CMUL(36)(pC), rC9	movaps	0-120(pA10), rC10	mulps	rB0, rC10	addss	CMUL(40)(pC), rC10	movaps	0-120(pA10,ldab), rC11	mulps	rB0, rC11	addss	CMUL(44)(pC), rC11	movaps	0-120(pA10,ldab,2), rC12	mulps	rB0, rC12	addss	CMUL(48)(pC), rC12	movaps	0-120(pA5,ldab,8), rC13	mulps	rB0, rC13	addss	CMUL(52)(pC), rC13#else	movaps	0-120(pA10,mldab5,2), rC0	movaps	0-120(pB0), rC13	mulps	rC13, rC0	movaps	0-120(pA5, mldab,4), rC1	mulps	rC13, rC1	movaps	0-120(pA10, mldab,8), rC2	mulps	rC13, rC2	movaps	0-120(pA5, mldab,2), rC3	mulps	rC13, rC3	movaps	0-120(pA5, mldab), rC4	mulps	rC13, rC4	movaps	0-120(pA5), rC5	mulps	rC13, rC5	movaps	0-120(pA5, ldab), rC6	mulps	rC13, rC6	movaps	0-120(pA5, ldab,2), rC7	mulps	rC13, rC7	movaps	0-120(pA10, mldab,2), rC8	mulps	rC13, rC8	movaps	0-120(pA5,ldab,4), rC9	mulps	rC13, rC9	movaps	0-120(pA10), rC10	mulps	rC13, rC10	movaps	0-120(pA10,ldab), rC11	mulps	rC13, rC11	movaps	0-120(pA10,ldab,2), rC12	mulps	rC13, rC12	mulps 	0-120(pA5,ldab,8), rC13#endif#if KB > 4	movaps	16-120(pA10,mldab5,2), rA0	movaps	16-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	16-120(pA5, mldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	16-120(pA10, mldab,8), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	16-120(pA5, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	16-120(pA5, mldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	movaps	16-120(pA5), rA0	mulps	rB0, rA0	addps	rA0, rC5	movaps	16-120(pA5, ldab), rA0	mulps	rB0, rA0	addps	rA0, rC6	movaps	16-120(pA5, ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC7	movaps	16-120(pA10, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC8	movaps	16-120(pA5,ldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC9	movaps	16-120(pA10), rA0	mulps	rB0, rA0	addps	rA0, rC10	movaps	16-120(pA10,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC11	movaps	16-120(pA10,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC12	mulps	16-120(pA5,ldab,8), rB0	addps	
rB0, rC13#endif#if KB > 8	movaps	32-120(pA10,mldab5,2), rA0	movaps	32-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	32-120(pA5, mldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	32-120(pA10, mldab,8), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	32-120(pA5, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	32-120(pA5, mldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	movaps	32-120(pA5), rA0	mulps	rB0, rA0	addps	rA0, rC5	movaps	32-120(pA5, ldab), rA0	mulps	rB0, rA0	addps	rA0, rC6	movaps	32-120(pA5, ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC7	movaps	32-120(pA10, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC8	movaps	32-120(pA5,ldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC9	movaps	32-120(pA10), rA0	mulps	rB0, rA0	addps	rA0, rC10	movaps	32-120(pA10,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC11	movaps	32-120(pA10,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC12	mulps	32-120(pA5,ldab,8), rB0	addps	rB0, rC13#endif#if KB > 12	movaps	48-120(pA10,mldab5,2), rA0	movaps	48-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	48-120(pA5, mldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	48-120(pA10, mldab,8), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	48-120(pA5, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	48-120(pA5, mldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	movaps	48-120(pA5), rA0	mulps	rB0, rA0	addps	rA0, rC5	movaps	48-120(pA5, ldab), rA0	mulps	rB0, rA0	addps	rA0, rC6	movaps	48-120(pA5, ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC7	movaps	48-120(pA10, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC8	movaps	48-120(pA5,ldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC9	movaps	48-120(pA10), rA0	mulps	rB0, rA0	addps	rA0, rC10	movaps	48-120(pA10,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC11	movaps	48-120(pA10,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC12	mulps	48-120(pA5,ldab,8), rB0	addps	rB0, rC13#endif#if KB > 16	movaps	64-120(pA10,mldab5,2), rA0	movaps	64-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	64-120(pA5, mldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	64-120(pA10, mldab,8), rA0	mulps	rB0, rA0	addps	
rA0, rC2	movaps	64-120(pA5, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	64-120(pA5, mldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	movaps	64-120(pA5), rA0	mulps	rB0, rA0	addps	rA0, rC5	movaps	64-120(pA5, ldab), rA0	mulps	rB0, rA0	addps	rA0, rC6	movaps	64-120(pA5, ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC7	movaps	64-120(pA10, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC8	movaps	64-120(pA5,ldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC9	movaps	64-120(pA10), rA0	mulps	rB0, rA0	addps	rA0, rC10	movaps	64-120(pA10,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC11	movaps	64-120(pA10,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC12	mulps	64-120(pA5,ldab,8), rB0	addps	rB0, rC13#endif#if KB > 20	movaps	80-120(pA10,mldab5,2), rA0	movaps	80-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	80-120(pA5, mldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	80-120(pA10, mldab,8), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	80-120(pA5, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	80-120(pA5, mldab), rA0	mulps	rB0, rA0	addps	rA0, rC4	movaps	80-120(pA5), rA0	mulps	rB0, rA0	addps	rA0, rC5	movaps	80-120(pA5, ldab), rA0	mulps	rB0, rA0	addps	rA0, rC6	movaps	80-120(pA5, ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC7	movaps	80-120(pA10, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC8	movaps	80-120(pA5,ldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC9	movaps	80-120(pA10), rA0	mulps	rB0, rA0	addps	rA0, rC10	movaps	80-120(pA10,ldab), rA0	mulps	rB0, rA0	addps	rA0, rC11	movaps	80-120(pA10,ldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC12	mulps	80-120(pA5,ldab,8), rB0	addps	rB0, rC13#endif#if KB > 24	movaps	96-120(pA10,mldab5,2), rA0	movaps	96-120(pB0), rB0	mulps	rB0, rA0	addps	rA0, rC0	movaps	96-120(pA5, mldab,4), rA0	mulps	rB0, rA0	addps	rA0, rC1	movaps	96-120(pA10, mldab,8), rA0	mulps	rB0, rA0	addps	rA0, rC2	movaps	96-120(pA5, mldab,2), rA0	mulps	rB0, rA0	addps	rA0, rC3	movaps	96-120(pA5, mldab), rA0

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?