atl_dmm1x14x56_sse2pabc.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 2,923 行 · 第 1/5 页

C
2,923
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2003 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#ifndef ATL_GAS_x8664   #error "This kernel requires x86-64 assembly!"#endif#include "atlas_asm.h"#if !defined(MB)   #define MB 0#endif#if !defined(NB)   #define NB 0#endif#if !defined(KB) || (KB == 0)   #error "KB must be a compile-time constant!"#endif#if (KB != 56)   #error "KB must be 56!"#endif#if (NB/14)*14 != NB   #error "NB must be multiple of 14!"#endif/* * Integer register usage shown be these defines */#define pA      %rcx#define pA10	%rbx#define ldab	%rbp#define mldab	%rdx#define mldab5  %rax#define pB      %rdi#define pC      %rsi#define stM     %r9#define stN     %r11#define pfA	%r8#define pA5 	pA#define pB0	pB#define pAS     %r13#define ldc     %r10#define mldc    %r14#define ldc3    %r15/*       rax     used in 32/64 conversion */#define NBso	(KB*8)#define MBKBso  (MB*KB*8)#define NB2so   (NBso+NBso)#define NB3so   (NBso+NBso+NBso)#define NB4so   (NBso+NBso+NBso+NBso)#define NB5so   (NBso+NBso+NBso+NBso+NBso)#define NB6so   (NB3so+NB3so)#define NB7so   (NB3so+NB4so)#define NB8so   (NB4so+NB4so)#define NB9so   (NB4so+NB5so)#define NB10so   (NB5so+NB5so)#define NB11so   (NB6so+NB5so)#define NB12so   (NB7so+NB5so)#define NB13so   (NB8so+NB5so)#define NB14so   (NB9so+NB5so)/* * SSE2 register usage shown be these defines */#define rA0	%xmm0#define rB0	%xmm1#define rC0	%xmm2#define rC1	%xmm3#define rC2	%xmm4#define rC3	%xmm5#define rC4	%xmm6#define rC5	%xmm7#define rC6	%xmm8#define rC7	%xmm9#define rC8	%xmm10#define rC9	%xmm11#define rC10	%xmm12#define rC11	%xmm13#define rC12	%xmm14#define rC13	%xmm15/* * Prefetch defines */#if 1#define pref2(mem) prefetcht1   mem#define prefB(mem) prefetcht0   mem#define prefC(mem) prefetchw	mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif	.text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* *      Save callee-saved iregs */	movq	%rbp, -8(%rsp)	movq	%rbx, -16(%rsp)	movq	%r12, -32(%rsp)	movq	%r13, -40(%rsp)	movq	%r14, -48(%rsp)	movq	%r15, -56(%rsp)#define SOFF -64#ifdef BETAX   #define BOF -24	movlpd	%xmm1, BOF(%rsp)#endif/* *      pA already comes in right reg;  load stN, *      Initialize pB = B; pC = C; NBso = NB * sizeof; */	movq	%rsi, stN	movq	%rdi, %r12	movq	%r9, pB			prefB((pB))			prefB(64(pB))	movq	16(%rsp), pC			prefC((pC))/* *      stM = M/14; stN = N */#if MB != 0	movq	$MB, stM#else	movq	%r12, stM#endif/* *      convert ldc to 64 bits, and mul by size */	movl	24(%rsp), %eax	cltq	movq	%rax, ldc#ifdef DREAL	shl	$3, ldc#else	shl	$4, ldc#endif/* *      At this point, pA5 has pA, pB0 has pB, stN has N, stM has M; swap *      them so that we can reverse loops */        movq    pA5, mldc        movq    pB0, pA5        movq    mldc, pB0			prefB(128(pB))        movq    stN, mldc        movq    stM, stN        movq    mldc, stM/* *      pfA = pA + M*KBso */	movq	stM, pfA	imulq	$NBso, pfA	addq	pA5, pfA/* *      Calculate and store incCn = sizeof*(N*ldc - 1) */        movq    stM, mldc        imulq   ldc, mldc#ifdef DREAL        subq    $8, mldc#else        subq    $16, mldc#endif        movq    mldc, SOFF(%rsp)/* *      mldc = -ldc; ldc3 = ldc*3, pC = pC + ldc*2 */        movq    ldc, mldc        neg     mldc        lea     (ldc,ldc,2), ldc3        lea     (pC, ldc,2), pC	addq	$120, pA5	addq	$120, pB0	movq	$KB*8, ldab	movq	$-KB*5*8, mldab5	movq	$-KB*8, mldab	subq	mldab5, pA5	lea	KB*8(pA5, ldab,4), pA10	movq	pA10, pAS	movq	stM, %r12UNLOOP:#if NB == 0	movq	%r12, stM        sub     $14, stM        jz      UMLOOPCU#else        movq    $NB-14, stM#endif#if NB != 14UMLOOP:/* *      rC[0-13] = pC[0-13] * beta */	ALIGN16/*UKLOOP: */#ifdef BETA1	movapd	0-120(pA10,mldab5,2), rC0	movapd	0-120(pB0), rB0	mulpd	rB0, rC0	addsd	(pC,mldc,2), rC0	movapd	0-120(pA5, mldab,4), rC1	mulpd	rB0, rC1	addsd	(pC,mldc), rC1	movapd	0-120(pA10, mldab,8), rC2	mulpd	rB0, rC2	addsd	(pC), rC2	movapd	0-120(pA5, mldab,2), rC3	mulpd	rB0, rC3	addsd	(pC,ldc), rC3	movapd	0-120(pA5, mldab), rC4	mulpd	rB0, rC4	addsd	(pC,ldc,2), rC4	movapd	0-120(pA5), rC5	mulpd	rB0, rC5	addsd	(pC,ldc3), rC5	movapd	0-120(pA5, ldab), rC6	mulpd	rB0, rC6	addsd	(pC,ldc,4), rC6                                                addq    ldc, pC	movapd	0-120(pA5, ldab,2), rC7                                                lea     (pC,ldc3,2), pC	mulpd	rB0, rC7	addsd	(pC,mldc,2), rC7	movapd	0-120(pA10, mldab,2), rC8	mulpd	rB0, rC8	addsd	(pC,mldc), rC8	movapd	0-120(pA5,ldab,4), rC9	mulpd	rB0, rC9	addsd	(pC), rC9	movapd	0-120(pA10), rC10	mulpd	rB0, rC10	addsd	(pC,ldc), rC10	movapd	0-120(pA10,ldab), rC11	mulpd	rB0, rC11	addsd	(pC,ldc,2), rC11	movapd	0-120(pA10,ldab,2), rC12	mulpd	rB0, rC12	addsd	(pC,ldc3), rC12	movapd	0-120(pA5,ldab,8), rC13	mulpd	rB0, rC13	addsd	(pC,ldc,4), rC13#elif defined(BETA0)	movapd	0-120(pA10,mldab5,2), rC0	movapd	0-120(pB0), rC13	mulpd	rC13, rC0	movapd	0-120(pA5, mldab,4), rC1	mulpd	rC13, rC1	movapd	0-120(pA10, mldab,8), rC2	mulpd	rC13, rC2	movapd	0-120(pA5, mldab,2), rC3	mulpd	rC13, rC3	movapd	0-120(pA5, mldab), rC4	mulpd	rC13, rC4	movapd	0-120(pA5), rC5	mulpd	rC13, rC5	movapd	0-120(pA5, ldab), rC6	mulpd	rC13, rC6	movapd	0-120(pA5, ldab,2), rC7	mulpd	rC13, rC7	movapd	0-120(pA10, mldab,2), rC8	mulpd	rC13, rC8	movapd	0-120(pA5,ldab,4), rC9	mulpd	rC13, rC9	movapd	0-120(pA10), rC10	mulpd	rC13, rC10	movapd	0-120(pA10,ldab), rC11	mulpd	rC13, rC11	movapd	0-120(pA10,ldab,2), rC12	mulpd	rC13, rC12	mulpd 	0-120(pA5,ldab,8), rC13#else	movsd	BOF(%rsp), rC0	movapd	rC0, rC1	movapd	rC0, rC2	movapd	rC0, rC3	movapd	rC0, rC4	movapd	rC0, rC5	movapd	rC0, rC6	movapd	rC0, rC7	movapd	rC0, rC8	movapd	rC0, rC9	movapd	rC0, rC10	movapd	rC0, rC11	movapd	rC0, rC12	movapd	rC0, rC13	mulsd	(pC,mldc,2), rC0	mulsd	(pC,mldc), rC1	mulsd	(pC), rC2	mulsd	(pC,ldc), rC3	mulsd	(pC,ldc,2), rC4	mulsd	(pC,ldc3), rC5	mulsd	(pC,ldc,4), rC6                                                add     ldc, pC                                                lea     (pC,ldc3,2), pC	mulsd	(pC,mldc,2), rC7	mulsd	(pC,mldc), rC8	mulsd	(pC), rC9	mulsd	(pC,ldc), rC10	mulsd	(pC,ldc,2), rC11	mulsd	(pC,ldc3), rC12	mulsd	(pC,ldc,4), rC13	movapd	0-120(pA10,mldab5,2), rA0	movapd	0-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0                                        lea     (pC,mldc,8), pC	movapd	0-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1                                        addq    ldc, pC	movapd	0-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	0-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	0-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	0-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	0-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	0-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	0-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	0-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	0-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	0-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	0-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	0-120(pA5,ldab,8), rB0	addpd	rB0, rC13#endif	movapd	16-120(pA10,mldab5,2), rA0	movapd	16-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0#ifdef BETA1                                        lea     (pC,mldc,8), pC#endif	movapd	16-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1#ifdef BETA1                                        addq    ldc, pC#endif	movapd	16-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	16-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	16-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	16-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	16-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	16-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	16-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	16-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	16-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	16-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	16-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	16-120(pA5,ldab,8), rB0	addpd	rB0, rC13	movapd	32-120(pA10,mldab5,2), rA0	movapd	32-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	32-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	32-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	32-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	32-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	32-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	32-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	32-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	32-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	32-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	32-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	32-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	32-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	32-120(pA5,ldab,8), rB0	addpd	rB0, rC13	movapd	48-120(pA10,mldab5,2), rA0	movapd	48-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	48-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	48-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	48-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	48-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	48-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	48-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	48-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	48-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	48-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	48-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	48-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	48-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	48-120(pA5,ldab,8), rB0	addpd	rB0, rC13#ifndef DREAL						pref2((pfA))						pref2(64(pfA))#endif	movapd	64-120(pA10,mldab5,2), rA0	movapd	64-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	64-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	64-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	64-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	64-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	64-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5	movapd	64-120(pA5, ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC6	movapd	64-120(pA5, ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC7	movapd	64-120(pA10, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC8	movapd	64-120(pA5,ldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC9	movapd	64-120(pA10), rA0	mulpd	rB0, rA0	addpd	rA0, rC10	movapd	64-120(pA10,ldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC11	movapd	64-120(pA10,ldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC12	mulpd	64-120(pA5,ldab,8), rB0	addpd	rB0, rC13	movapd	80-120(pA10,mldab5,2), rA0	movapd	80-120(pB0), rB0	mulpd	rB0, rA0	addpd	rA0, rC0	movapd	80-120(pA5, mldab,4), rA0	mulpd	rB0, rA0	addpd	rA0, rC1	movapd	80-120(pA10, mldab,8), rA0	mulpd	rB0, rA0	addpd	rA0, rC2	movapd	80-120(pA5, mldab,2), rA0	mulpd	rB0, rA0	addpd	rA0, rC3	movapd	80-120(pA5, mldab), rA0	mulpd	rB0, rA0	addpd	rA0, rC4	movapd	80-120(pA5), rA0	mulpd	rB0, rA0	addpd	rA0, rC5

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?